Better parser that can do a Set instead of just a list of URLs in a chain.

3 years ago · 366fc83bf2
parent 503007149f
commit 366fc83bf2
3 changed files with 178 additions and 79 deletions
--- a/02-filter-a-log-file/chains_parser.js
+++ b/02-filter-a-log-file/chains_parser.js
@ -9,13 +9,20 @@ class Parser {
    this.next = "";
  }
-  match(reg) {
+  match(reg, consume=true) {
    const n = this.next.match(reg);
-    const element = n.length > 1 ? n.slice(1) : n[0];
+    if(n === null) {
      return undefined;
    } else {
      const element = n.length > 1 ? n.slice(1) : n[0];
      if(consume) {
        this.next = this.next.slice(n[0].length);
      }
-    this.next = this.next.slice(n[0].length);
+      return element;
-    return element;
+    }
  }
  start(line) {
@ -23,122 +30,195 @@ class Parser {
    this.next = line;
  }
-  ws() {
+  ws(consume=true) {
-    this.match(/ +/);
+    return this.match(/ +/, consume);
  }
-  parse() {
+  parse_new_log(ip) {
-    try {
+    const [port, conn_id, conn_count] = this.match(/^([0-9]+):([0-9]+):([0-9]+)/);
      const [ip, port, conn_id, conn_count] = this.match(/([0-9.]+):([0-9]+):([0-9]+):([0-9]+)/);
-      this.ws();
+    this.ws();
-      const [ time ] = this.match(/\[(.*)\]/);
+    const [ time ] = this.match(/\[(.*)\]/);
-      this.ws();
+    this.ws();
-      const [ url ] = this.match(/"(.+?)"/);
+    const [ full_url ] = this.match(/"(.+?)"/);
-      this.ws();
+    const [url, params] = full_url.split("?");
-      const code = this.match(/\-|[0-9]+/);
+    this.ws();
-      this.ws();
+    const code = this.match(/\-|[0-9]+/);
-      const bytes = this.match(/\-|[0-9]+/);
+    this.ws();
-      this.ws();
+    const bytes = this.match(/\-|[0-9]+/);
-      const [refer] = this.match(/"(.+?)"/);
+    this.ws();
-      this.ws();
+    const [refer] = this.match(/"(.+?)"/);
-      const [ua] = this.match(/"(.+?)"/);
+    this.ws();
-      return {
+    const [ua] = this.match(/"(.+?)"/);
-        ip,
+
-        conn_id: parseInt(conn_id),
+    return {
-        conn_count: parseInt(conn_count),
+      ip,
-        time: new Date(time),
+      conn_id: parseInt(conn_id),
-        url,
+      conn_count: parseInt(conn_count),
-        code: parseInt(code),
+      time: new Date(time),
-        size: parseInt(bytes),
+      url, params,
-        refer: refer === '-' ? undefined : refer,
+      code: parseInt(code),
-        ua: UAParser(ua)
+      size: parseInt(bytes),
-      };
+      refer: refer === '-' ? undefined : refer,
-    } catch(error) {
+      ua: UAParser(ua)
-      throw new Error(`Parsing Error: ${ this.next }`);
+    };
    }
  }
 }
-const read_stream = fs.createReadStream(process.argv[2]);
+  parse_old_log(ip) {
    this.match(/- -/);
    this.ws();
    // FORMAT: 29/Mar/2022:22:40:52 +0200
    const [ day, month, year, hour, minute, seconds, tz_offset ] = this.match(/\[([0-9]+)\/([A-Za-z]+)\/([0-9]+):([0-9]+):([0-9]+):([0-9]+) (.+?)\]/);
    this.ws();
-const rl = readline.createInterface({
+    const [ method, full_url, http_version ] = this.match(/"([A-Z]+) (.+) HTTP\/([0-9].[0-9])"/);
  input: read_stream,
  crlfDelay: Infinity
 });
-const parser = new Parser();
+    const [ url, params ] = full_url.split("?");
 const chains = {};
-const skip = /(authcheck\/?|.*.svg|.*.webmanifest|.*.js|.*.css|.*.png|.*.txt|.*.woff|.*.jpg|.*.mp4|.*.torrent|\-|.*.ico|\/api\/.*\?.*|.*.html|.*.map)$/
+    this.ws();
-for await (let line of rl) {
+    const code = this.match(/\-|[0-9]+/);
  parser.start(line);
-  try {
+    this.ws();
    const data = parser.parse();
-    if(data.ua.os && data.code === 200 && !data.url.match(skip)) {
+    const bytes = this.match(/\-|[0-9]+/);
      let chain = data.ip in chains ? chains[data.ip] : [];
-      chain.push([data.time, data.url, data.refer]);
+    this.ws();
-      chains[data.ip] = chain;
+    const [refer] = this.match(/"(.+?)"/);
    this.ws();
    const [ua] = this.match(/"(.+?)"/);
    // some lines in this log format end with one more quoted field (sometimes another IP address); it is matched here and then ignored
    const unknown = this.match(/".+?"$/);
    return {
      ip,
      method,
      http_version,
      time: new Date(`${day} ${month} ${year} ${hour}:${minute}:${seconds} ${tz_offset}`),
      url, params,
      code: parseInt(code),
      size: parseInt(bytes),
      refer: refer === '-' ? undefined : refer,
      ua: UAParser(ua)
    };
  }
  parse() {
    const ip = this.match(/^[0-9\.]+/);
    const test = this.match(/(:| )/);
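    // NOTE: == is required because match() returns an array of capture groups when the regex has groups, so test is [":"] or [" "]; loose == coerces that one-element array to its string, while === compares an array to a string and is always false.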
    if(test == ":") {
      return this.parse_new_log(ip);
    } else if(test == " ") {
      return this.parse_old_log(ip);
    } else {
      // console.log(`PARSE ERROR, expected : or ' ' but got ${test}`);
      return {};
    }
  } catch(err) {
    if(line !== "") console.error(err);
  }
 }
-const uniques = {};
+const parse_logs = async (file_name) => {
  const read_stream = fs.createReadStream(file_name);
-for(let key in chains) {
+  const rl = readline.createInterface({
-  const chain = chains[key];
+    input: read_stream,
    crlfDelay: Infinity
  });
-  const first = chain[0][2];
+  const parser = new Parser();
-  const urls = chain.map(s => {
+  const stats = {
-    const ref = s[2];
+    lines: 0,
-    if(ref && ref !== first && !ref.includes("learnjsthehardway.com") ) {
+    chains: 0,
-      return `${s[1]}(${s[2]})`;
+    excluded: 0,
-    } else {
+    errors: 0,
-      return s[1];
+    roots: 0,
    firsts: 0
  };
  const skip = /(authcheck|.*\.svg|.*\.webmanifest|.*\.js|.*\.css|.*php|socket\.io|\.env|.*\.png|.*\.txt|.*\.woff|.*\.jpg|.*\.mp4|.*\.torrent|\-|.*\.ico|\/api\/.*\?.*|.*\.html|.*\.map|.*.php)/
  const by_ip = {};
  for await (let line of rl) {
    try {
      stats.lines += 1;
      parser.start(line);
      const data = parser.parse();
      // skip lines that have content we don't care about
      if(data.url.match(skip)) continue;
      // store or update the chain in the by_ip chain
      const ip_chain = by_ip[data.ip] || [];
      ip_chain.push(data);
      by_ip[data.ip] = ip_chain;
    } catch(error) {
      stats.errors += 1;
    }
-  });
+  }
  return [by_ip, stats];
 }
 // Reduce one IP's request chain to its distinct URLs, in first-seen order.
 // Returns the Set's values() iterator (single-use), not an array, so callers
 // should spread or iterate it exactly once.
 const chain_to_set = (requests) => {
  const urls = [...requests].map(({ url }) => url);
  return new Set(urls).values();
 };
 const sort_request_chains = (by_ip) => {
  let ip_chains = {};
  let seen;
-  let full = urls.filter((s, index, self) => {
+  for(let [ip, requests] of Object.entries(by_ip)) {
-    if(s === seen) {
+    const chain = chain_to_set(requests);
-      return false;
+
-    } else {
+    const ref = requests[0].refer ? `[${requests[0].refer}]` : "";
-      seen = s;
+    const url_set = [ref, ...chain].join(" ");
      return true;
    }
  }).join(" ");
-  if(first) {
+    ip_chains[url_set] = url_set in ip_chains ? ip_chains[url_set] + 1 : 1;
    full = `[${first}] ${full}`;
  }
-  uniques[full] = full in uniques ? uniques[full] + 1 : 1;
+  const chains_sorted = Object.entries(ip_chains);
  chains_sorted.sort((a, b) => b[1] - a[1]);
  return chains_sorted;
 }
-const sorted = Object.entries(uniques);
+const [by_ip, stats] = await parse_logs(process.argv[2]);
-sorted.sort((a, b) => b[1] - a[1]);
+const chains_sorted = sort_request_chains(by_ip);
-for(let [url, count] of sorted) {
+for(let [url, count] of chains_sorted) {
  console.log(count, url);
 }
 console.log(stats);
--- a/02-filter-a-log-file/package-lock.json
+++ b/02-filter-a-log-file/package-lock.json
@ -10,6 +10,7 @@
      "license": "BSD",
      "dependencies": {
        "ava": "^4.3.1",
        "date-fns": "^2.29.1",
        "ua-parser-js": "^1.0.2"
      }
    },
@ -500,6 +501,18 @@
        "node": ">=0.10.0"
      }
    },
    "node_modules/date-fns": {
      "version": "2.29.1",
      "resolved": "https://registry.npmjs.org/date-fns/-/date-fns-2.29.1.tgz",
      "integrity": "sha512-dlLD5rKaKxpFdnjrs+5azHDFOPEu4ANy/LTh04A1DTzMM7qoajmKCBc8pkKRFT41CNzw+4gQh79X5C+Jq27HAw==",
      "engines": {
        "node": ">=0.11"
      },
      "funding": {
        "type": "opencollective",
        "url": "https://opencollective.com/date-fns"
      }
    },
    "node_modules/date-time": {
      "version": "3.1.0",
      "resolved": "https://registry.npmjs.org/date-time/-/date-time-3.1.0.tgz",
@ -2164,6 +2177,11 @@
        "array-find-index": "^1.0.1"
      }
    },
    "date-fns": {
      "version": "2.29.1",
      "resolved": "https://registry.npmjs.org/date-fns/-/date-fns-2.29.1.tgz",
      "integrity": "sha512-dlLD5rKaKxpFdnjrs+5azHDFOPEu4ANy/LTh04A1DTzMM7qoajmKCBc8pkKRFT41CNzw+4gQh79X5C+Jq27HAw=="
    },
    "date-time": {
      "version": "3.1.0",
      "resolved": "https://registry.npmjs.org/date-time/-/date-time-3.1.0.tgz",
--- a/02-filter-a-log-file/package.json
+++ b/02-filter-a-log-file/package.json
@ -11,6 +11,7 @@
  "license": "BSD",
  "dependencies": {
    "ava": "^4.3.1",
    "date-fns": "^2.29.1",
    "ua-parser-js": "^1.0.2"
  }
 }