diff --git a/02-filter-a-log-file/chains_parser.js b/02-filter-a-log-file/chains_parser.js index 1081367..102ff85 100644 --- a/02-filter-a-log-file/chains_parser.js +++ b/02-filter-a-log-file/chains_parser.js @@ -3,6 +3,23 @@ import assert from "assert"; import UAParser from "ua-parser-js"; import readline from "readline"; import { program } from "commander"; +import glob from "fast-glob"; + + +program + .option("--min ", "The lowest count to print. Stop at this.", 1) + .option("--errors", "Show the erorrs so you can fix them.", false) + .option("--format ", "Output format, text or json. Ignores min for raw output.", "json") + .option("--outfile ", "Save to file rather than stdout.") + .requiredOption("--domain ", "Domain for the log. Gets removed as a refer.") + .requiredOption("--input ", "Input file glob.") + .description("Processes different web server logs to determine request chain frequency.") + .version(0.1); + +program.parse(); +const OPTS = program.opts(); +OPTS.min = parseInt(OPTS.min); + class Parser { constructor() { @@ -139,7 +156,7 @@ class Parser { } } -const parse_logs = async (file_name, errors) => { +const parse_log_file = async (results, stats, file_name, errors) => { const read_stream = fs.createReadStream(file_name); const rl = readline.createInterface({ @@ -149,19 +166,8 @@ const parse_logs = async (file_name, errors) => { const parser = new Parser(); - const stats = { - lines: 0, - chains: 0, - excluded: 0, - errors: 0, - roots: 0, - firsts: 0 - }; - const skip = /(authcheck|.*\.svg|.*\.webmanifest|.*\.js|.*\.css|.*php|socket\.io|\.env|.*\.png|.*\.txt|.*\.woff|.*\.jpg|.*\.mp4|.*\.torrent|\-|.*\.ico|\/api\/.*\?.*|.*\.html|.*\.map|.*.php)/ - const by_ip = {}; - for await (let line of rl) { try { stats.lines += 1; @@ -173,40 +179,43 @@ const parse_logs = async (file_name, errors) => { if(data.url.match(skip)) continue; // store or update the chain in the by_ip chain - const ip_chain = by_ip[data.ip] || []; + const ip_chain = results[data.ip] || []; ip_chain.push(data); - by_ip[data.ip] = ip_chain; + results[data.ip] = ip_chain; } catch(error) { if(errors) console.error(error); stats.errors += 1; } } - - return [by_ip, stats]; } -const chain_to_set = (requests) => { - const path = new Set(); +const parse_logs_glob = async (file_glob, errors) => { + const file_list = glob.sync(file_glob); + const results = {}; + const stats = { + lines: 0, + chains: 0, + excluded: 0, + errors: 0, + roots: 0, + firsts: 0 + }; - for(let r of requests) { - path.add(r.url); + for(let file_name of file_list) { + await parse_log_file(results, stats, file_name, errors); } - return path.values(); + return [results, stats]; } -const chain_to_list = (requests) => { - const path = []; - let seen; +const chain_to_set = (requests) => { + const path = new Set(); for(let r of requests) { - if(r.url != seen) { - path.push(r.url); - seen = r.url; - } + path.add(r.url); } return path.values(); @@ -221,11 +230,11 @@ const construct_url_set = (domain, ref, full_chain) => { } } -const construct_request_chains = (by_ip, domain, as_set) => { +const construct_request_chains = (by_ip, domain) => { let ip_chains = {}; for(let [ip, requests] of Object.entries(by_ip)) { - const chain = as_set ? chain_to_set(requests) : chain_to_list(requests); + const chain = chain_to_set(requests); // record the initial refer to track entry to the site const ref = requests[0].refer; @@ -294,33 +303,14 @@ const write_results = async (stats, chains, format, outfile) => { fs.closeSync(fd); } -program - .option("--no-set", "Use a Set instead of a list for chains.") - .option("--min ", "The lowest count to print. Stop at this.", 1) - .option("--errors", "Show the erorrs so you can fix them.", false) - .option("--format ", "Output format, text or json. Ignores min for raw output.", "json") - .option("--outfile ", "Save to file rather than stdout.") - .requiredOption("--domain ", "Domain for the log. Gets removed as a refer.") - .requiredOption("--input ", "Input file.") - .description("Processes different web server logs to determine request chain frequency.") - .version(0.1); - -program.parse(); -const OPTS = program.opts(); -OPTS.min = parseInt(OPTS.min); assert(!isNaN(OPTS.min), `min must be a number, you have ${OPTS.min}`); -try { - const [by_ip, stats] = await parse_logs(OPTS.input, OPTS.errors); - const chains = construct_request_chains(by_ip, OPTS.domain, OPTS.set); +const [by_ip, stats] = await parse_logs_glob(OPTS.input, OPTS.errors); +const chains = construct_request_chains(by_ip, OPTS.domain); - if(OPTS.outfile) { - write_results(stats, chains, OPTS.format, OPTS.outfile); - } else { - output_results(stats, chains, OPTS.format, OPTS.outfile); - } -} catch(error) { - console.error(error.message); - process.exit(1); +if(OPTS.outfile) { + write_results(stats, chains, OPTS.format, OPTS.outfile); +} else { + output_results(stats, chains, OPTS.format, OPTS.outfile); } diff --git a/02-filter-a-log-file/package-lock.json b/02-filter-a-log-file/package-lock.json index 1978b25..f686e90 100644 --- a/02-filter-a-log-file/package-lock.json +++ b/02-filter-a-log-file/package-lock.json @@ -12,6 +12,7 @@ "ava": "^4.3.1", "commander": "^9.4.0", "date-fns": "^2.29.1", + "fast-glob": "^3.2.11", "ua-parser-js": "^1.0.2" } }, diff --git a/02-filter-a-log-file/package.json b/02-filter-a-log-file/package.json index 35dad40..b9e990c 100644 --- a/02-filter-a-log-file/package.json +++ b/02-filter-a-log-file/package.json @@ -13,6 +13,7 @@ "ava": "^4.3.1", "commander": "^9.4.0", "date-fns": "^2.29.1", + "fast-glob": "^3.2.11", "ua-parser-js": "^1.0.2" } }