From 366fc83bf2c2994bfef68191c0739f35aecc3ad2 Mon Sep 17 00:00:00 2001 From: "Zed A. Shaw" Date: Wed, 17 Aug 2022 21:00:47 -0400 Subject: [PATCH] Better parser that can do a Set instead of just a list of URLs in a chain. --- 02-filter-a-log-file/chains_parser.js | 238 +++++++++++++++++-------- 02-filter-a-log-file/package-lock.json | 18 ++ 02-filter-a-log-file/package.json | 1 + 3 files changed, 178 insertions(+), 79 deletions(-) diff --git a/02-filter-a-log-file/chains_parser.js b/02-filter-a-log-file/chains_parser.js index 33e02f2..27e8186 100644 --- a/02-filter-a-log-file/chains_parser.js +++ b/02-filter-a-log-file/chains_parser.js @@ -9,13 +9,20 @@ class Parser { this.next = ""; } - match(reg) { + match(reg, consume=true) { const n = this.next.match(reg); - const element = n.length > 1 ? n.slice(1) : n[0]; + if(n === null) { + return undefined; + } else { + const element = n.length > 1 ? n.slice(1) : n[0]; + + if(consume) { + this.next = this.next.slice(n[0].length); + } - this.next = this.next.slice(n[0].length); - return element; + return element; + } } start(line) { @@ -23,122 +30,195 @@ class Parser { this.next = line; } - ws() { - this.match(/ +/); + ws(consume=true) { + return this.match(/ +/, consume); } - parse() { - try { - const [ip, port, conn_id, conn_count] = this.match(/([0-9.]+):([0-9]+):([0-9]+):([0-9]+)/); + parse_new_log(ip) { + const [port, conn_id, conn_count] = this.match(/^([0-9]+):([0-9]+):([0-9]+)/); - this.ws(); + this.ws(); - const [ time ] = this.match(/\[(.*)\]/); + const [ time ] = this.match(/\[(.*)\]/); - this.ws(); + this.ws(); - const [ url ] = this.match(/"(.+?)"/); + const [ full_url ] = this.match(/"(.+?)"/); - this.ws(); + const [url, params] = full_url.split("?"); - const code = this.match(/\-|[0-9]+/); + this.ws(); - this.ws(); + const code = this.match(/\-|[0-9]+/); - const bytes = this.match(/\-|[0-9]+/); + this.ws(); - this.ws(); + const bytes = this.match(/\-|[0-9]+/); - const [refer] = this.match(/"(.+?)"/); + this.ws(); - this.ws(); + const [refer] = this.match(/"(.+?)"/); - const [ua] = this.match(/"(.+?)"/); + this.ws(); - return { - ip, - conn_id: parseInt(conn_id), - conn_count: parseInt(conn_count), - time: new Date(time), - url, - code: parseInt(code), - size: parseInt(bytes), - refer: refer === '-' ? undefined : refer, - ua: UAParser(ua) - }; - } catch(error) { - throw new Error(`Parsing Error: ${ this.next }`); - } + const [ua] = this.match(/"(.+?)"/); + + return { + ip, + conn_id: parseInt(conn_id), + conn_count: parseInt(conn_count), + time: new Date(time), + url, params, + code: parseInt(code), + size: parseInt(bytes), + refer: refer === '-' ? undefined : refer, + ua: UAParser(ua) + }; } -} -const read_stream = fs.createReadStream(process.argv[2]); + parse_old_log(ip) { + this.match(/- -/); + + this.ws(); + + // FORMAT: 29/Mar/2022:22:40:52 +0200 + const [ day, month, year, hour, minute, seconds, tz_offset ] = this.match(/\[([0-9]+)\/([A-Za-z]+)\/([0-9]+):([0-9]+):([0-9]+):([0-9]+) (.+?)\]/); + + this.ws(); -const rl = readline.createInterface({ - input: read_stream, - crlfDelay: Infinity -}); + const [ method, full_url, http_version ] = this.match(/"([A-Z]+) (.+) HTTP\/([0-9].[0-9])"/); -const parser = new Parser(); -const chains = {}; + const [ url, params ] = full_url.split("?"); -const skip = /(authcheck\/?|.*.svg|.*.webmanifest|.*.js|.*.css|.*.png|.*.txt|.*.woff|.*.jpg|.*.mp4|.*.torrent|\-|.*.ico|\/api\/.*\?.*|.*.html|.*.map)$/ + this.ws(); -for await (let line of rl) { - parser.start(line); + const code = this.match(/\-|[0-9]+/); - try { - const data = parser.parse(); + this.ws(); - if(data.ua.os && data.code === 200 && !data.url.match(skip)) { - let chain = data.ip in chains ? chains[data.ip] : []; + const bytes = this.match(/\-|[0-9]+/); - chain.push([data.time, data.url, data.refer]); + this.ws(); - chains[data.ip] = chain; + const [refer] = this.match(/"(.+?)"/); + + this.ws(); + + const [ua] = this.match(/"(.+?)"/); + + // this is another IP address sometimes in another log format that I'll ignore + const unknown = this.match(/".+?"$/); + + return { + ip, + method, + http_version, + time: new Date(`${day} ${month} ${year} ${hour}:${minute}:${seconds} ${tz_offset}`), + url, params, + code: parseInt(code), + size: parseInt(bytes), + refer: refer === '-' ? undefined : refer, + ua: UAParser(ua) + }; + } + + parse() { + const ip = this.match(/^[0-9\.]+/); + const test = this.match(/(:| )/); + + // BUG: uhh for some reason it needs == here? === says : doesn't equal : + if(test == ":") { + return this.parse_new_log(ip); + } else if(test == " ") { + return this.parse_old_log(ip); + } else { + // console.log(`PARSE ERROR, expected : or ' ' but got ${test}`); + return {}; } - } catch(err) { - if(line !== "") console.error(err); } } -const uniques = {}; +const parse_logs = async (file_name) => { + const read_stream = fs.createReadStream(file_name); -for(let key in chains) { - const chain = chains[key]; + const rl = readline.createInterface({ + input: read_stream, + crlfDelay: Infinity + }); - const first = chain[0][2]; + const parser = new Parser(); - const urls = chain.map(s => { - const ref = s[2]; - if(ref && ref !== first && !ref.includes("learnjsthehardway.com") ) { - return `${s[1]}(${s[2]})`; - } else { - return s[1]; + const stats = { + lines: 0, + chains: 0, + excluded: 0, + errors: 0, + roots: 0, + firsts: 0 + }; + + const skip = /(authcheck|.*\.svg|.*\.webmanifest|.*\.js|.*\.css|.*php|socket\.io|\.env|.*\.png|.*\.txt|.*\.woff|.*\.jpg|.*\.mp4|.*\.torrent|\-|.*\.ico|\/api\/.*\?.*|.*\.html|.*\.map|.*.php)/ + + const by_ip = {}; + + for await (let line of rl) { + try { + stats.lines += 1; + parser.start(line); + + const data = parser.parse(); + + // skip lines that have content we don't care about + if(data.url.match(skip)) continue; + + // store or update the chain in the by_ip chain + const ip_chain = by_ip[data.ip] || []; + + ip_chain.push(data); + + by_ip[data.ip] = ip_chain; + } catch(error) { + stats.errors += 1; } - }); + } + return [by_ip, stats]; +} + +const chain_to_set = (requests) => { + const path = new Set(); + + for(let r of requests) { + path.add(r.url); + } + + return path.values(); +} + +const sort_request_chains = (by_ip) => { + let ip_chains = {}; let seen; - let full = urls.filter((s, index, self) => { - if(s === seen) { - return false; - } else { - seen = s; - return true; - } - }).join(" "); + for(let [ip, requests] of Object.entries(by_ip)) { + const chain = chain_to_set(requests); + + const ref = requests[0].refer ? `[${requests[0].refer}]` : ""; + const url_set = [ref, ...chain].join(" "); - if(first) { - full = `[${first}] ${full}`; + ip_chains[url_set] = url_set in ip_chains ? ip_chains[url_set] + 1 : 1; } - uniques[full] = full in uniques ? uniques[full] + 1 : 1; + const chains_sorted = Object.entries(ip_chains); + chains_sorted.sort((a, b) => b[1] - a[1]); + + return chains_sorted; } -const sorted = Object.entries(uniques); -sorted.sort((a, b) => b[1] - a[1]); +const [by_ip, stats] = await parse_logs(process.argv[2]); +const chains_sorted = sort_request_chains(by_ip); -for(let [url, count] of sorted) { +for(let [url, count] of chains_sorted) { console.log(count, url); } + +console.log(stats); diff --git a/02-filter-a-log-file/package-lock.json b/02-filter-a-log-file/package-lock.json index bf77395..2d631b5 100644 --- a/02-filter-a-log-file/package-lock.json +++ b/02-filter-a-log-file/package-lock.json @@ -10,6 +10,7 @@ "license": "BSD", "dependencies": { "ava": "^4.3.1", + "date-fns": "^2.29.1", "ua-parser-js": "^1.0.2" } }, @@ -500,6 +501,18 @@ "node": ">=0.10.0" } }, + "node_modules/date-fns": { + "version": "2.29.1", + "resolved": "https://registry.npmjs.org/date-fns/-/date-fns-2.29.1.tgz", + "integrity": "sha512-dlLD5rKaKxpFdnjrs+5azHDFOPEu4ANy/LTh04A1DTzMM7qoajmKCBc8pkKRFT41CNzw+4gQh79X5C+Jq27HAw==", + "engines": { + "node": ">=0.11" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/date-fns" + } + }, "node_modules/date-time": { "version": "3.1.0", "resolved": "https://registry.npmjs.org/date-time/-/date-time-3.1.0.tgz", @@ -2164,6 +2177,11 @@ "array-find-index": "^1.0.1" } }, + "date-fns": { + "version": "2.29.1", + "resolved": "https://registry.npmjs.org/date-fns/-/date-fns-2.29.1.tgz", + "integrity": "sha512-dlLD5rKaKxpFdnjrs+5azHDFOPEu4ANy/LTh04A1DTzMM7qoajmKCBc8pkKRFT41CNzw+4gQh79X5C+Jq27HAw==" + }, "date-time": { "version": "3.1.0", "resolved": "https://registry.npmjs.org/date-time/-/date-time-3.1.0.tgz", diff --git a/02-filter-a-log-file/package.json b/02-filter-a-log-file/package.json index 7fab2c0..9ff9d73 100644 --- a/02-filter-a-log-file/package.json +++ b/02-filter-a-log-file/package.json @@ -11,6 +11,7 @@ "license": "BSD", "dependencies": { "ava": "^4.3.1", + "date-fns": "^2.29.1", "ua-parser-js": "^1.0.2" } }