This now removes the list version and only uses sets; it can also glob many input files to produce the results.

master
Zed A. Shaw 2 years ago
parent 55dfed8179
commit b2c521b521
  1. 100
      02-filter-a-log-file/chains_parser.js
  2. 1
      02-filter-a-log-file/package-lock.json
  3. 1
      02-filter-a-log-file/package.json

@ -3,6 +3,23 @@ import assert from "assert";
import UAParser from "ua-parser-js";
import readline from "readline";
import { program } from "commander";
import glob from "fast-glob";
// Command-line interface definition. --domain and --input are required;
// --input is a file glob, so one run can cover several log files.
program
  .option("--min <Number>", "The lowest count to print. Stop at this.", 1)
  .option("--errors", "Show the errors so you can fix them.", false)
  .option("--format <string>", "Output format, text or json. Ignores min for raw output.", "json")
  .option("--outfile <string>", "Save to file rather than stdout.")
  .requiredOption("--domain <String>", "Domain for the log. Gets removed as a refer.")
  .requiredOption("--input <String>", "Input file glob.")
  .description("Processes different web server logs to determine request chain frequency.")
  // commander documents the version as a string, so pass "0.1" rather than
  // the number 0.1.
  .version("0.1");

program.parse();

const OPTS = program.opts();

// --min arrives as a string when given on the command line (the default is
// the number 1). Parse it explicitly as base 10 so the result never depends
// on prefix sniffing; a non-numeric value becomes NaN.
OPTS.min = Number.parseInt(OPTS.min, 10);
class Parser {
constructor() {
@ -139,7 +156,7 @@ class Parser {
}
}
const parse_logs = async (file_name, errors) => {
const parse_log_file = async (results, stats, file_name, errors) => {
const read_stream = fs.createReadStream(file_name);
const rl = readline.createInterface({
@ -149,19 +166,8 @@ const parse_logs = async (file_name, errors) => {
const parser = new Parser();
const stats = {
lines: 0,
chains: 0,
excluded: 0,
errors: 0,
roots: 0,
firsts: 0
};
const skip = /(authcheck|.*\.svg|.*\.webmanifest|.*\.js|.*\.css|.*php|socket\.io|\.env|.*\.png|.*\.txt|.*\.woff|.*\.jpg|.*\.mp4|.*\.torrent|\-|.*\.ico|\/api\/.*\?.*|.*\.html|.*\.map|.*.php)/
const by_ip = {};
for await (let line of rl) {
try {
stats.lines += 1;
@ -173,40 +179,43 @@ const parse_logs = async (file_name, errors) => {
if(data.url.match(skip)) continue;
// store or update the chain in the by_ip chain
const ip_chain = by_ip[data.ip] || [];
const ip_chain = results[data.ip] || [];
ip_chain.push(data);
by_ip[data.ip] = ip_chain;
results[data.ip] = ip_chain;
} catch(error) {
if(errors) console.error(error);
stats.errors += 1;
}
}
return [by_ip, stats];
}
const chain_to_set = (requests) => {
const path = new Set();
const parse_logs_glob = async (file_glob, errors) => {
const file_list = glob.sync(file_glob);
const results = {};
const stats = {
lines: 0,
chains: 0,
excluded: 0,
errors: 0,
roots: 0,
firsts: 0
};
for(let r of requests) {
path.add(r.url);
for(let file_name of file_list) {
await parse_log_file(results, stats, file_name, errors);
}
return path.values();
return [results, stats];
}
const chain_to_list = (requests) => {
const path = [];
let seen;
const chain_to_set = (requests) => {
const path = new Set();
for(let r of requests) {
if(r.url != seen) {
path.push(r.url);
seen = r.url;
}
path.add(r.url);
}
return path.values();
@ -221,11 +230,11 @@ const construct_url_set = (domain, ref, full_chain) => {
}
}
const construct_request_chains = (by_ip, domain, as_set) => {
const construct_request_chains = (by_ip, domain) => {
let ip_chains = {};
for(let [ip, requests] of Object.entries(by_ip)) {
const chain = as_set ? chain_to_set(requests) : chain_to_list(requests);
const chain = chain_to_set(requests);
// record the initial refer to track entry to the site
const ref = requests[0].refer;
@ -294,33 +303,14 @@ const write_results = async (stats, chains, format, outfile) => {
fs.closeSync(fd);
}
program
.option("--no-set", "Use a Set instead of a list for chains.")
.option("--min <Number>", "The lowest count to print. Stop at this.", 1)
.option("--errors", "Show the erorrs so you can fix them.", false)
.option("--format <string>", "Output format, text or json. Ignores min for raw output.", "json")
.option("--outfile <string>", "Save to file rather than stdout.")
.requiredOption("--domain <String>", "Domain for the log. Gets removed as a refer.")
.requiredOption("--input <String>", "Input file.")
.description("Processes different web server logs to determine request chain frequency.")
.version(0.1);
program.parse();
const OPTS = program.opts();
OPTS.min = parseInt(OPTS.min);
assert(!isNaN(OPTS.min), `min must be a number, you have ${OPTS.min}`);
try {
const [by_ip, stats] = await parse_logs(OPTS.input, OPTS.errors);
const chains = construct_request_chains(by_ip, OPTS.domain, OPTS.set);
const [by_ip, stats] = await parse_logs_glob(OPTS.input, OPTS.errors);
const chains = construct_request_chains(by_ip, OPTS.domain);
if(OPTS.outfile) {
write_results(stats, chains, OPTS.format, OPTS.outfile);
} else {
output_results(stats, chains, OPTS.format, OPTS.outfile);
}
} catch(error) {
console.error(error.message);
process.exit(1);
if(OPTS.outfile) {
write_results(stats, chains, OPTS.format, OPTS.outfile);
} else {
output_results(stats, chains, OPTS.format, OPTS.outfile);
}

@ -12,6 +12,7 @@
"ava": "^4.3.1",
"commander": "^9.4.0",
"date-fns": "^2.29.1",
"fast-glob": "^3.2.11",
"ua-parser-js": "^1.0.2"
}
},

@ -13,6 +13,7 @@
"ava": "^4.3.1",
"commander": "^9.4.0",
"date-fns": "^2.29.1",
"fast-glob": "^3.2.11",
"ua-parser-js": "^1.0.2"
}
}

Loading…
Cancel
Save