The next version of the parser will use only sets, which turns out to work almost the same as maintaining a list, but is faster and doesn't care about the order in which the events come in.

master
Zed A. Shaw 2 years ago
parent cf527da326
commit afe3cfca47
  1. 58
      02-filter-a-log-file/chains_parser.js

@ -212,27 +212,53 @@ const chain_to_list = (requests) => {
return path.values(); return path.values();
} }
const sort_request_chains = (by_ip, as_set) => { const construct_url_set = (domain, ref, full_chain) => {
// this tags the chains with refer using [ref]
if(ref && !ref.includes(domain)) {
return [`[${ref}]`, ...full_chain.slice(1)].join(" ");
} else {
return full_chain.join(" ");
}
}
const construct_request_chains = (by_ip, domain, as_set) => {
let ip_chains = {}; let ip_chains = {};
let seen;
for(let [ip, requests] of Object.entries(by_ip)) { for(let [ip, requests] of Object.entries(by_ip)) {
const chain = as_set ? chain_to_set(requests) : chain_to_list(requests); const chain = as_set ? chain_to_set(requests) : chain_to_list(requests);
const ref = requests[0].refer ? `[${requests[0].refer}]` : ""; // record the initial refer to track entry to the site
const url_set = [ref, ...chain].join(" "); const ref = requests[0].refer;
const full_chain = [...chain];
const url_set = construct_url_set(domain, ref, full_chain);
ip_chains[url_set] = url_set in ip_chains ? ip_chains[url_set] + 1 : 1; // using url as key to count,full_chain
if(url_set in ip_chains) {
ip_chains[url_set].count += 1;
} else {
ip_chains[url_set] = { count: 1, comes_from: ref, full_chain};
}
} }
const chains_sorted = Object.entries(ip_chains); return ip_chains;
chains_sorted.sort((a, b) => b[1] - a[1]); }
return chains_sorted; const sort_request_chains = (chains) => {
const converted = [];
for(let [url, stats] of Object.entries(chains)) {
if(stats.comes_from) {
converted.push([stats.count, `[${stats.comes_from}] ${stats.full_chain.join(' ')}`]);
} else {
converted.push([stats.count, `${stats.full_chain.join(' ')}`]);
}
}
return converted.sort((a, b) => b[0] - a[0]);
} }
const output_results = (min, chains_sorted) => { const output_results = (chains_sorted, min) => {
for(let [url, count] of chains_sorted) { for(let [count, url] of chains_sorted) {
if(count >= min) { if(count >= min) {
console.log(count, url); console.log(count, url);
} }
@ -245,6 +271,8 @@ program
.option("--no-set", "Use a Set instead of a list for chains.") .option("--no-set", "Use a Set instead of a list for chains.")
.option("--min <Number>", "The lowest count to print. Stop at this.", 1) .option("--min <Number>", "The lowest count to print. Stop at this.", 1)
.option("--errors", "Show the erorrs so you can fix them.", false) .option("--errors", "Show the erorrs so you can fix them.", false)
.option("--format <string>", "Output format, text or json. Ignores min for raw output.", "json")
.requiredOption("--domain <String>", "Domain for the log. Gets removed as a refer.")
.requiredOption("--input <String>", "Input file.") .requiredOption("--input <String>", "Input file.")
.description("Processes different web server logs to determine request chain frequency.") .description("Processes different web server logs to determine request chain frequency.")
.version(0.1); .version(0.1);
@ -256,5 +284,11 @@ OPTS.min = parseInt(OPTS.min);
assert(!isNaN(OPTS.min), `min must be a number, you have ${OPTS.min}`); assert(!isNaN(OPTS.min), `min must be a number, you have ${OPTS.min}`);
const [by_ip, stats] = await parse_logs(OPTS.input, OPTS.errors); const [by_ip, stats] = await parse_logs(OPTS.input, OPTS.errors);
const chains_sorted = sort_request_chains(by_ip, OPTS.set); const chains = construct_request_chains(by_ip, OPTS.domain, OPTS.set);
output_results(OPTS.min, chains_sorted); const chains_sorted = sort_request_chains(chains);
if(OPTS.format === "json") {
console.log(chains);
} else {
output_results(chains_sorted, OPTS.min);
}

Loading…
Cancel
Save