The next version of the parser will use only sets. Using a set turns out to work almost the same as maintaining a list, but it is faster and does not depend on the order in which the events arrive.

3 years ago · afe3cfca47
parent cf527da326
commit afe3cfca47
1 changed files with 46 additions and 12 deletions
--- a/02-filter-a-log-file/chains_parser.js
+++ b/02-filter-a-log-file/chains_parser.js
@ -212,27 +212,53 @@ const chain_to_list = (requests) => {
  return path.values();
 }
-const sort_request_chains = (by_ip, as_set) => {
+const construct_url_set = (domain, ref, full_chain) => {
  // this tags the chains with refer using [ref]
  if(ref && !ref.includes(domain)) {
    return [`[${ref}]`, ...full_chain.slice(1)].join(" ");
  } else {
    return full_chain.join(" ");
  }
 }
 const construct_request_chains = (by_ip, domain, as_set) => {
  let ip_chains = {};
  let seen;
  for(let [ip, requests] of Object.entries(by_ip)) {
    const chain = as_set ? chain_to_set(requests) : chain_to_list(requests);
-    const ref = requests[0].refer ? `[${requests[0].refer}]` : "";
+    // record the initial refer to track entry to the site
-    const url_set = [ref, ...chain].join(" ");
+    const ref = requests[0].refer;
    const full_chain = [...chain];
    const url_set = construct_url_set(domain, ref, full_chain);
-    ip_chains[url_set] = url_set in ip_chains ? ip_chains[url_set] + 1 : 1;
+    // using url as key to count,full_chain
    if(url_set in ip_chains) {
      ip_chains[url_set].count += 1;
    } else {
      ip_chains[url_set] = { count: 1, comes_from: ref, full_chain};
    }
  }
-  const chains_sorted = Object.entries(ip_chains);
+  return ip_chains;
-  chains_sorted.sort((a, b) => b[1] - a[1]);
+}
-  return chains_sorted;
+const sort_request_chains = (chains) => {
  const converted = [];
  for(let [url, stats] of Object.entries(chains)) {
    if(stats.comes_from) {
      converted.push([stats.count, `[${stats.comes_from}] ${stats.full_chain.join(' ')}`]);
    } else {
      converted.push([stats.count, `${stats.full_chain.join(' ')}`]);
    }
  }
  return converted.sort((a, b) => b[0] - a[0]);
 }
-const output_results = (min, chains_sorted) => {
+const output_results = (chains_sorted, min) => {
-  for(let [url, count] of chains_sorted) {
+  for(let [count, url] of chains_sorted) {
    if(count >= min) {
      console.log(count, url);
    }
@ -245,6 +271,8 @@ program
  .option("--no-set", "Use a Set instead of a list for chains.")
  .option("--min <Number>", "The lowest count to print. Stop at this.", 1)
  .option("--errors", "Show the erorrs so you can fix them.", false)
  .option("--format <string>", "Output format, text or json. Ignores min for raw output.", "json")
  .requiredOption("--domain <String>", "Domain for the log. Gets removed as a refer.")
  .requiredOption("--input <String>", "Input file.")
  .description("Processes different web server logs to determine request chain frequency.")
  .version(0.1);
@ -256,5 +284,11 @@ OPTS.min = parseInt(OPTS.min);
 assert(!isNaN(OPTS.min), `min must be a number, you have ${OPTS.min}`);
 const [by_ip, stats] = await parse_logs(OPTS.input, OPTS.errors);
-const chains_sorted = sort_request_chains(by_ip, OPTS.set);
+const chains = construct_request_chains(by_ip, OPTS.domain, OPTS.set);
-output_results(OPTS.min, chains_sorted);
+const chains_sorted = sort_request_chains(chains);
 if(OPTS.format === "json") {
  console.log(chains);
 } else {
  output_results(chains_sorted, OPTS.min);
 }