From 914af2272c7df85aaac799e3ab8185831403a404 Mon Sep 17 00:00:00 2001 From: "Zed A. Shaw" Date: Thu, 25 Aug 2022 16:36:32 -0400 Subject: [PATCH] Clean up the directory structure to improve the flow of the lesson more. --- .../parser.js} | 0 .../parser_incremental.js} | 0 .../chains_parser/parser_slow.js | 260 ++++++++++++++++++ .../reporter.js} | 0 4 files changed, 260 insertions(+) rename 02-filter-a-log-file/{chains_parser.js => chains_parser/parser.js} (100%) rename 02-filter-a-log-file/{chains_parser_incremental.js => chains_parser/parser_incremental.js} (100%) create mode 100644 02-filter-a-log-file/chains_parser/parser_slow.js rename 02-filter-a-log-file/{chains_reporter.js => chains_parser/reporter.js} (100%) diff --git a/02-filter-a-log-file/chains_parser.js b/02-filter-a-log-file/chains_parser/parser.js similarity index 100% rename from 02-filter-a-log-file/chains_parser.js rename to 02-filter-a-log-file/chains_parser/parser.js diff --git a/02-filter-a-log-file/chains_parser_incremental.js b/02-filter-a-log-file/chains_parser/parser_incremental.js similarity index 100% rename from 02-filter-a-log-file/chains_parser_incremental.js rename to 02-filter-a-log-file/chains_parser/parser_incremental.js diff --git a/02-filter-a-log-file/chains_parser/parser_slow.js b/02-filter-a-log-file/chains_parser/parser_slow.js new file mode 100644 index 0000000..b1d94c2 --- /dev/null +++ b/02-filter-a-log-file/chains_parser/parser_slow.js @@ -0,0 +1,260 @@ +import fs from "fs"; +import assert from "assert"; +import UAParser from "ua-parser-js"; +import readline from "readline"; +import { program } from "commander"; + +class Parser { + constructor() { + this.text = ""; + this.next = ""; + } + + match(reg, consume=true) { + const n = this.next.match(reg); + + if(n === null) { + return undefined; + } else { + const element = n.length > 1 ? n.slice(1) : n[0]; + + if(consume) { + this.next = this.next.slice(n[0].length); + } + + return element; + } + } + + start(line) { + this.text = line; + this.next = line; + } + + ws(consume=true) { + return this.match(/ +/, consume); + } + + parse_new_log(ip) { + const [port, conn_id, conn_count] = this.match(/^([0-9]+):([0-9]+):([0-9]+)/); + + this.ws(); + + const [ time ] = this.match(/\[(.*)\]/); + + this.ws(); + + const [ full_url ] = this.match(/"(.+?)"/); + + const [url, params] = full_url.split("?"); + + this.ws(); + + const code = this.match(/\-|[0-9]+/); + + this.ws(); + + const bytes = this.match(/\-|[0-9]+/); + + this.ws(); + + const [refer] = this.match(/"(.+?)"/); + + this.ws(); + + const [ua] = this.match(/"(.+?)"/); + + return { + ip, + conn_id: parseInt(conn_id), + conn_count: parseInt(conn_count), + time: new Date(time), + url, params, + code: parseInt(code), + size: parseInt(bytes), + refer: refer === '-' ? undefined : refer, + ua: UAParser(ua) + }; + } + + parse_old_log(ip) { + this.match(/- -/); + + this.ws(); + + // FORMAT: 29/Mar/2022:22:40:52 +0200 + const [ day, month, year, hour, minute, seconds, tz_offset ] = this.match(/\[([0-9]+)\/([A-Za-z]+)\/([0-9]+):([0-9]+):([0-9]+):([0-9]+) (.+?)\]/); + + this.ws(); + + const [ method, full_url, http_version ] = this.match(/"([A-Z]+) (.+) HTTP\/([0-9].[0-9])"/); + + const [ url, params ] = full_url.split("?"); + + this.ws(); + + const code = this.match(/\-|[0-9]+/); + + this.ws(); + + const bytes = this.match(/\-|[0-9]+/); + + this.ws(); + + const [refer] = this.match(/"(.+?)"/); + + this.ws(); + + const [ua] = this.match(/"(.+?)"/); + + // this is another IP address sometimes in another log format that I'll ignore + const unknown = this.match(/".+?"$/); + + return { + ip, + method, + http_version, + time: new Date(`${day} ${month} ${year} ${hour}:${minute}:${seconds} ${tz_offset}`), + url, params, + code: parseInt(code), + size: parseInt(bytes), + refer: refer === '-' ? undefined : refer, + ua: UAParser(ua) + }; + } + + parse() { + const ip = this.match(/^[0-9\.]+/); + const test = this.match(/(:| )/); + + // BUG: uhh for some reason it needs == here? === says : doesn't equal : + if(test == ":") { + return this.parse_new_log(ip); + } else if(test == " ") { + return this.parse_old_log(ip); + } else { + // console.log(`PARSE ERROR, expected : or ' ' but got ${test}`); + return {}; + } + } +} + +const parse_logs = async (file_name, errors) => { + const read_stream = fs.createReadStream(file_name); + + const rl = readline.createInterface({ + input: read_stream, + crlfDelay: Infinity + }); + + const parser = new Parser(); + + const stats = { + lines: 0, + chains: 0, + excluded: 0, + errors: 0, + roots: 0, + firsts: 0 + }; + + const skip = /(authcheck|.*\.svg|.*\.webmanifest|.*\.js|.*\.css|.*php|socket\.io|\.env|.*\.png|.*\.txt|.*\.woff|.*\.jpg|.*\.mp4|.*\.torrent|\-|.*\.ico|\/api\/.*\?.*|.*\.html|.*\.map|.*.php)/ + + const by_ip = {}; + + for await (let line of rl) { + try { + stats.lines += 1; + parser.start(line); + + const data = parser.parse(); + + // skip lines that have content we don't care about + if(data.url.match(skip)) continue; + + // store or update the chain in the by_ip chain + const ip_chain = by_ip[data.ip] || []; + + ip_chain.push(data); + + by_ip[data.ip] = ip_chain; + } catch(error) { + if(errors) console.error(error); + + stats.errors += 1; + } + } + + return [by_ip, stats]; +} + +const chain_to_set = (requests) => { + const path = new Set(); + + for(let r of requests) { + path.add(r.url); + } + + return path.values(); +} + +const chain_to_list = (requests) => { + const path = []; + let seen; + + for(let r of requests) { + if(r.url != seen) { + path.push(r.url); + seen = r.url; + } + } + + return path.values(); +} + +const sort_request_chains = (by_ip, as_set) => { + let ip_chains = {}; + let seen; + + for(let [ip, requests] of Object.entries(by_ip)) { + const chain = as_set ? chain_to_set(requests) : chain_to_list(requests); + + const ref = requests[0].refer ? `[${requests[0].refer}]` : ""; + const url_set = [ref, ...chain].join(" "); + + ip_chains[url_set] = url_set in ip_chains ? ip_chains[url_set] + 1 : 1; + } + + const chains_sorted = Object.entries(ip_chains); + chains_sorted.sort((a, b) => b[1] - a[1]); + + return chains_sorted; +} + +const output_results = (min, chains_sorted) => { + for(let [url, count] of chains_sorted) { + if(count >= min) { + console.log(count, url); + } + } + + console.log(stats); +} + +program + .option("--no-set", "Use a Set instead of a list for chains.") + .option("--min ", "The lowest count to print. Stop at this.", 1) + .option("--errors", "Show the erorrs so you can fix them.", false) + .requiredOption("--input ", "Input file.") + .description("Processes different web server logs to determine request chain frequency.") + .version(0.1); + +program.parse(); +const OPTS = program.opts(); +OPTS.min = parseInt(OPTS.min); + +assert(!isNaN(OPTS.min), `min must be a number, you have ${OPTS.min}`); + +const [by_ip, stats] = await parse_logs(OPTS.input, OPTS.errors); +const chains_sorted = sort_request_chains(by_ip, OPTS.set); +output_results(OPTS.min, chains_sorted); diff --git a/02-filter-a-log-file/chains_reporter.js b/02-filter-a-log-file/chains_parser/reporter.js similarity index 100% rename from 02-filter-a-log-file/chains_reporter.js rename to 02-filter-a-log-file/chains_parser/reporter.js