import fs from "fs"; import assert from "assert"; import UAParser from "ua-parser-js"; import readline from "readline"; import { program } from "commander"; import glob from "fast-glob"; program .option("--min ", "The lowest count to print. Stop at this.", 1) .option("--errors", "Show the erorrs so you can fix them.", false) .option("--format ", "Output format, text or json. Ignores min for raw output.", "json") .option("--outfile ", "Save to file rather than stdout.") .requiredOption("--domain ", "Domain for the log. Gets removed as a refer.") .requiredOption("--input ", "Input file glob.") .description("Processes different web server logs to determine request chain frequency.") .version(0.1); program.parse(); const OPTS = program.opts(); OPTS.min = parseInt(OPTS.min); class Parser { constructor() { this.text = ""; this.next = ""; } match(reg, consume=true) { const n = this.next.match(reg); if(n === null) { return undefined; } else { const element = n.length > 1 ? n.slice(1) : n[0]; if(consume) { this.next = this.next.slice(n[0].length); } return element; } } start(line) { this.text = line; this.next = line; } ws(consume=true) { return this.match(/ +/, consume); } parse_new_log(ip) { const [port, conn_id, conn_count] = this.match(/^([0-9]+):([0-9]+):([0-9]+)/); this.ws(); const [ time ] = this.match(/\[(.*)\]/); this.ws(); const [ full_url ] = this.match(/"(.+?)"/); const [url, params] = full_url.split("?"); this.ws(); const code = this.match(/\-|[0-9]+/); this.ws(); const bytes = this.match(/\-|[0-9]+/); this.ws(); const [refer] = this.match(/"(.+?)"/); this.ws(); const [ua] = this.match(/"(.+?)"/); return { ip, conn_id: parseInt(conn_id), conn_count: parseInt(conn_count), time: new Date(time), url, params, code: parseInt(code), size: parseInt(bytes), refer: refer === '-' ? undefined : refer, ua: UAParser(ua) }; } parse_old_log(ip) { this.match(/- -/); this.ws(); // FORMAT: 29/Mar/2022:22:40:52 +0200 const [ day, month, year, hour, minute, seconds, tz_offset ] = this.match(/\[([0-9]+)\/([A-Za-z]+)\/([0-9]+):([0-9]+):([0-9]+):([0-9]+) (.+?)\]/); this.ws(); const [ method, full_url, http_version ] = this.match(/"([A-Z]+) (.+) HTTP\/([0-9].[0-9])"/); const [ url, params ] = full_url.split("?"); this.ws(); const code = this.match(/\-|[0-9]+/); this.ws(); const bytes = this.match(/\-|[0-9]+/); this.ws(); const [refer] = this.match(/"(.+?)"/); this.ws(); const [ua] = this.match(/"(.+?)"/); // this is another IP address sometimes in another log format that I'll ignore const unknown = this.match(/".+?"$/); return { ip, method, http_version, time: new Date(`${day} ${month} ${year} ${hour}:${minute}:${seconds} ${tz_offset}`), url, params, code: parseInt(code), size: parseInt(bytes), refer: refer === '-' ? undefined : refer, ua: UAParser(ua) }; } parse() { const ip = this.match(/^[0-9\.]+/); const test = this.match(/(:| )/); // BUG: uhh for some reason it needs == here? === says : doesn't equal : if(test == ":") { return this.parse_new_log(ip); } else if(test == " ") { return this.parse_old_log(ip); } else { // console.log(`PARSE ERROR, expected : or ' ' but got ${test}`); return {}; } } } const parse_log_file = async (results, stats, file_name, errors) => { const read_stream = fs.createReadStream(file_name); const rl = readline.createInterface({ input: read_stream, crlfDelay: Infinity }); const parser = new Parser(); const skip = /(authcheck|.*\.svg|.*\.webmanifest|.*\.js|.*\.css|.*php|socket\.io|\.env|.*\.png|.*\.txt|.*\.woff|.*\.jpg|.*\.mp4|.*\.torrent|\-|.*\.ico|\/api\/.*\?.*|.*\.html|.*\.map|.*.php)/ for await (let line of rl) { try { stats.lines += 1; parser.start(line); const data = parser.parse(); // skip lines that have content we don't care about if(data.url.match(skip)) continue; // store or update the chain in the by_ip chain const ip_chain = results[data.ip] || []; ip_chain.push(data); results[data.ip] = ip_chain; } catch(error) { if(errors) console.error(error); stats.errors += 1; } } } const parse_logs_glob = async (file_glob, errors) => { const file_list = glob.sync(file_glob); const results = {}; const stats = { lines: 0, chains: 0, excluded: 0, errors: 0, roots: 0, firsts: 0 }; for(let file_name of file_list) { await parse_log_file(results, stats, file_name, errors); } return [results, stats]; } const chain_to_set = (requests) => { const path = new Set(); for(let r of requests) { path.add(r.url); } return path.values(); } const construct_url_set = (domain, ref, full_chain) => { // this tags the chains with refer using [ref] if(ref && !ref.includes(domain)) { return [`[${ref}]`, ...full_chain.slice(1)].join(" "); } else { return full_chain.join(" "); } } const construct_request_chains = (by_ip, domain) => { let ip_chains = {}; for(let [ip, requests] of Object.entries(by_ip)) { const chain = chain_to_set(requests); // record the initial refer to track entry to the site const ref = requests[0].refer; const full_chain = [...chain]; const url_set = construct_url_set(domain, ref, full_chain); // using url as key to count,full_chain if(url_set in ip_chains) { ip_chains[url_set].count += 1; } else { ip_chains[url_set] = { count: 1, comes_from: ref, full_chain}; } } return ip_chains; } const sort_request_chains = (chains, min) => { const converted = []; for(let [url, stats] of Object.entries(chains)) { if(stats.count < min) continue; // skip below min if(stats.comes_from) { converted.push([stats.count, `[${stats.comes_from}] ${stats.full_chain.join(' ')}`]); } else { converted.push([stats.count, `${stats.full_chain.join(' ')}`]); } } return converted.sort((a, b) => b[0] - a[0]); } const output_results = async (stats, chains, format, outfile) => { if(format === "json") { const data = {stats, chains, date: new Date()}; console.log(data); } else { const chains_sorted = sort_request_chains(chains, OPTS.min); for(let [count, url] of chains_sorted) { console.log(count, url); } console.log(stats); } } const write_results = async (stats, chains, format, outfile) => { assert(outfile, "Output file required."); // unlike unix APIs this uses exceptions rather than return values for errors const fd = fs.openSync(outfile, "w+"); if(format === "json") { const data = {stats, chains, date: new Date()}; const bytes = fs.writeSync(fd, Buffer.from(JSON.stringify(data, null, 4)), 0); } else { const chains_sorted = sort_request_chains(chains, OPTS.min); for(let [count, url] of chains_sorted) { const bytes = fs.writeSync(fd, Buffer.from(`${count} ${url}\n`)); } } fs.closeSync(fd); } assert(!isNaN(OPTS.min), `min must be a number, you have ${OPTS.min}`); const [by_ip, stats] = await parse_logs_glob(OPTS.input, OPTS.errors); const chains = construct_request_chains(by_ip, OPTS.domain); if(OPTS.outfile) { write_results(stats, chains, OPTS.format, OPTS.outfile); } else { output_results(stats, chains, OPTS.format, OPTS.outfile); }