These are the projects for the JavaScript Level 2 module in Learn JS the Hard Way.
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
 
 
 

316 lines
7.5 KiB

import fs from "fs";
import assert from "assert";
import UAParser from "ua-parser-js";
import readline from "readline";
import { program } from "commander";
import glob from "fast-glob";
// Command line interface. --domain and --input are mandatory; everything
// else has a default. The parsed options are exposed as the module-level
// OPTS object used by the rest of the script.
program
  .option("--min <Number>", "The lowest count to print. Stop at this.", 1)
  .option("--errors", "Show the errors so you can fix them.", false)
  .option("--format <string>", "Output format, text or json. Ignores min for raw output.", "json")
  .option("--outfile <string>", "Save to file rather than stdout.")
  .requiredOption("--domain <String>", "Domain for the log. Gets removed as a refer.")
  .requiredOption("--input <String>", "Input file glob.")
  .description("Processes different web server logs to determine request chain frequency.")
  .version("0.1");
program.parse();
const OPTS = program.opts();
// --min is a string when supplied on the command line and the number 1 when
// defaulted; normalise both to an integer (NaN when it is not numeric, which
// is checked later before any work is done).
OPTS.min = Number.parseInt(OPTS.min, 10);
/**
 * Regex-driven cursor over a single access-log line.
 *
 * `text` holds the full line given to start(); `next` holds the part that has
 * not been consumed yet. Two line layouts are recognised, selected by the
 * character that directly follows the leading IP address:
 *   ":"  -> parse_new_log:  ip:port:conn_id:conn_count [time] "url" code bytes "refer" "ua"
 *   " "  -> parse_old_log:  ip - - [dd/Mon/yyyy:hh:mm:ss tz] "METHOD url HTTP/x.y" code bytes "refer" "ua"
 *
 * The field parsers destructure the result of match(); when a field is
 * missing, that destructuring throws a TypeError, so callers must wrap
 * parse() in try/catch and treat an exception as a malformed line.
 */
class Parser {
  constructor() {
    this.text = "";
    this.next = "";
  }

  /**
   * Apply `reg` to the unconsumed text.
   *
   * @param {RegExp} reg - pattern to apply; it must not carry the `g` flag.
   * @param {boolean} [consume=true] - when true, drop everything up to and
   *   including the match from `next`.
   * @returns {string|string[]|undefined} the whole match when the pattern has
   *   no capture groups, the array of captures when it has some, or
   *   undefined when nothing matched (in which case nothing is consumed).
   */
  match(reg, consume=true) {
    const found = this.next.match(reg);
    if (found === null) {
      return undefined;
    }
    if (consume) {
      // Cut at the END of the match. Slicing by the match length alone is
      // only right when the match starts at index 0; for an unanchored
      // pattern it would remove the wrong characters.
      this.next = this.next.slice((found.index ?? 0) + found[0].length);
    }
    return found.length > 1 ? found.slice(1) : found[0];
  }

  /** Load a new line, resetting the cursor to its beginning. */
  start(line) {
    this.text = line;
    this.next = line;
  }

  /**
   * Match a run of spaces at the cursor.
   * Anchored so that it can never skip over non-space content.
   */
  ws(consume=true) {
    return this.match(/^ +/, consume);
  }

  /**
   * Parse the fields both layouts end with: status code, byte count, quoted
   * referer and quoted user agent. "-" for code or bytes yields NaN; a "-"
   * referer is reported as undefined.
   */
  #parse_trailer() {
    const code = this.match(/\-|[0-9]+/);
    this.ws();
    const bytes = this.match(/\-|[0-9]+/);
    this.ws();
    // [^"]* instead of .+? so that an empty "" field is one (empty) field
    // rather than swallowing the opening quote of the following field.
    const [refer] = this.match(/"([^"]*)"/);
    this.ws();
    const [ua] = this.match(/"([^"]*)"/);
    return {
      code: Number.parseInt(code, 10),
      size: Number.parseInt(bytes, 10),
      refer: refer === "-" ? undefined : refer,
      ua: UAParser(ua),
    };
  }

  /**
   * Parse the remainder of a "new" format line; the cursor must be just
   * after the ":" that follows the IP.
   *
   * @param {string} ip - the already-consumed client address.
   * @returns {object} ip, conn_id, conn_count, time, url, params, code, size,
   *   refer, ua. The port is consumed but not reported.
   */
  parse_new_log(ip) {
    const [, conn_id, conn_count] = this.match(/^([0-9]+):([0-9]+):([0-9]+)/);
    this.ws();
    // Lazy so the capture stops at the first "]" and not at a "]" that
    // appears later in the line (for example inside the user agent).
    const [time] = this.match(/\[(.*?)\]/);
    this.ws();
    const [full_url] = this.match(/"(.+?)"/);
    const [url, params] = full_url.split("?");
    this.ws();
    const trailer = this.#parse_trailer();
    return {
      ip,
      conn_id: Number.parseInt(conn_id, 10),
      conn_count: Number.parseInt(conn_count, 10),
      time: new Date(time),
      url, params,
      ...trailer,
    };
  }

  /**
   * Parse the remainder of an "old" format line; the cursor must be just
   * after the space that follows the IP.
   *
   * @param {string} ip - the already-consumed client address.
   * @returns {object} ip, method, http_version, time, url, params, code,
   *   size, refer, ua.
   */
  parse_old_log(ip) {
    this.match(/- -/);
    this.ws();
    // FORMAT: 29/Mar/2022:22:40:52 +0200
    const [day, month, year, hour, minute, seconds, tz_offset] = this.match(/\[([0-9]+)\/([A-Za-z]+)\/([0-9]+):([0-9]+):([0-9]+):([0-9]+) (.+?)\]/);
    this.ws();
    const [method, full_url, http_version] = this.match(/"([A-Z]+) (.+?) HTTP\/([0-9]\.[0-9])"/);
    const [url, params] = full_url.split("?");
    this.ws();
    const trailer = this.#parse_trailer();
    // Some logs carry one more quoted field at the end (sometimes another IP
    // address); it is consumed and ignored.
    this.match(/".+?"$/);
    return {
      ip,
      method,
      http_version,
      // NOTE(review): this is not an ISO-8601 string, so parsing relies on
      // the JavaScript engine's lenient date parser.
      time: new Date(`${day} ${month} ${year} ${hour}:${minute}:${seconds} ${tz_offset}`),
      url, params,
      ...trailer,
    };
  }

  /**
   * Parse the line given to start().
   *
   * @returns {object} the parsed record, or an empty object when the line
   *   does not begin with an IP address followed by ":" or " ".
   * @throws {TypeError} when the layout is recognised but a field is missing.
   */
  parse() {
    const ip = this.match(/^[0-9\.]+/);
    if (ip === undefined) {
      return {};
    }
    // No capture group, so match() returns a plain string and strict
    // equality works (a capturing pattern would return [":"], which only
    // compares equal to ":" under loose ==).
    const separator = this.match(/^[: ]/);
    if (separator === ":") {
      return this.parse_new_log(ip);
    }
    if (separator === " ") {
      return this.parse_old_log(ip);
    }
    return {};
  }
}
/**
 * Read one log file line by line and append every interesting request to the
 * per-IP list in `results`.
 *
 * @param {object} results - map of ip -> array of parsed records; mutated.
 * @param {object} stats - counters; `lines`, `excluded` and `errors` are
 *   incremented here.
 * @param {string} file_name - path of the log file to read.
 * @param {boolean} errors - when truthy, print each per-line error to stderr.
 * @returns {Promise<void>} resolves once the whole file has been read.
 */
const parse_log_file = async (results, stats, file_name, errors) => {
  const read_stream = fs.createReadStream(file_name);
  const rl = readline.createInterface({
    input: read_stream,
    crlfDelay: Infinity
  });
  const parser = new Parser();
  // Requests whose URL matches any of these alternatives are left out of the chains.
  // NOTE(review): the pattern is unanchored, so the bare `\-` alternative
  // excludes every URL containing a dash and `.*php` every URL containing
  // "php" - confirm that this is intended.
  const skip = /(authcheck|.*\.svg|.*\.webmanifest|.*\.js|.*\.css|.*php|socket\.io|\.env|.*\.png|.*\.txt|.*\.woff|.*\.jpg|.*\.mp4|.*\.torrent|\-|.*\.ico|\/api\/.*\?.*|.*\.html|.*\.map|.*.php)/;
  for await (const line of rl) {
    try {
      stats.lines += 1;
      parser.start(line);
      const data = parser.parse();
      // The parser returns a record without a url for layouts it does not
      // recognise; report that as a real error instead of relying on the
      // TypeError a later property access would raise.
      if (typeof data.url !== "string") {
        throw new Error(`Unrecognized log line: ${line}`);
      }
      // skip lines that have content we don't care about
      if (data.url.match(skip)) {
        stats.excluded += 1;
        continue;
      }
      // store or update the chain in the by_ip chain
      (results[data.ip] ??= []).push(data);
    } catch (error) {
      // Best effort: one bad line is counted but must not stop the run.
      if (errors) console.error(error);
      stats.errors += 1;
    }
  }
};
/**
 * Expand a file glob and parse every matching log file, one after another,
 * into a single shared result map.
 *
 * @param {string} file_glob - glob pattern handed to glob.sync.
 * @param {boolean} errors - forwarded to parse_log_file for every file.
 * @returns {Promise<Array>} a two element array: [results, stats].
 */
const parse_logs_glob = async (file_glob, errors) => {
  const matched_files = glob.sync(file_glob);
  // Counters shared by all files; every one starts at zero.
  const stats = { lines: 0, chains: 0, excluded: 0, errors: 0, roots: 0, firsts: 0 };
  const results = {};
  // Files are processed strictly in sequence, each one awaited before the next.
  for (let index = 0; index < matched_files.length; index += 1) {
    await parse_log_file(results, stats, matched_files[index], errors);
  }
  return [results, stats];
};
/**
 * Reduce a list of requests to the distinct URLs they hit, in order of first
 * appearance.
 *
 * @param {Iterable<object>} requests - records that each expose a `url`.
 * @returns {Iterator<string>} the value iterator of the underlying Set, not
 *   the Set itself.
 */
const chain_to_set = (requests) => {
  const urls_in_order = Array.from(requests, (request) => request.url);
  const unique_urls = new Set(urls_in_order);
  return unique_urls.values();
};
/**
 * Build the string key that identifies a request chain.
 *
 * When the visitor arrived from a referer that does not contain `domain`,
 * the first entry of the chain is replaced by the referer wrapped in square
 * brackets; otherwise the key is just the URLs joined by spaces.
 *
 * @param {string} domain - the site's own domain.
 * @param {string|undefined} ref - referer of the first request, if any.
 * @param {string[]} full_chain - distinct URLs of the chain, in order.
 * @returns {string} the chain key.
 */
const construct_url_set = (domain, ref, full_chain) => {
  const came_from_outside = Boolean(ref) && !ref.includes(domain);
  if (!came_from_outside) {
    return full_chain.join(" ");
  }
  // this tags the chains with refer using [ref]
  const tag = `[${ref}]`;
  const after_first = full_chain.slice(1);
  return [tag].concat(after_first).join(" ");
};
/**
 * Group visitors by the chain of distinct URLs they requested and count how
 * many visitors produced each chain.
 *
 * @param {object} by_ip - map of ip -> array of parsed request records.
 * @param {string} domain - the site's own domain, used to decide whether the
 *   first referer is external.
 * @returns {object} map of chain key -> { count, comes_from, full_chain },
 *   where comes_from is the referer of the visitor's first request and
 *   full_chain the distinct URLs of the first visitor that produced the key.
 */
const construct_request_chains = (by_ip, domain) => {
  // Counted in a Map so that keys such as "toString", "constructor" or
  // "__proto__" are ordinary keys and can never hit Object.prototype.
  const counted = new Map();
  for (const requests of Object.values(by_ip)) {
    // An IP without requests has no chain (and no first referer to read).
    if (requests.length === 0) continue;
    const full_chain = [...chain_to_set(requests)];
    // record the initial refer to track entry to the site
    const ref = requests[0].refer;
    const url_set = construct_url_set(domain, ref, full_chain);
    const existing = counted.get(url_set);
    if (existing === undefined) {
      counted.set(url_set, { count: 1, comes_from: ref, full_chain });
    } else {
      existing.count += 1;
    }
  }
  // Callers iterate and serialise a plain object, so convert back.
  return Object.fromEntries(counted);
};
/**
 * Turn the chain map into printable [count, text] rows, most frequent first.
 *
 * @param {object} chains - map of chain key -> { count, comes_from, full_chain }.
 * @param {number} min - chains whose count is below this are left out.
 * @returns {Array} rows of [count, text]; the text is the chain's URLs joined
 *   by spaces, prefixed with "[referer] " when the chain has a referer.
 */
const sort_request_chains = (chains, min) => {
  const rows = Object.values(chains)
    .filter(({ count }) => !(count < min)) // skip below min
    .map(({ count, comes_from, full_chain }) => {
      const urls = `${full_chain.join(' ')}`;
      const text = comes_from ? `[${comes_from}] ${urls}` : urls;
      return [count, text];
    });
  // Descending by count.
  rows.sort((left, right) => right[0] - left[0]);
  return rows;
};
/**
 * Print the results to stdout.
 *
 * @param {object} stats - run counters.
 * @param {object} chains - map of chain key -> { count, comes_from, full_chain }.
 * @param {string} format - "json" prints one JSON document holding stats,
 *   chains and the current date; any other value prints one "count chain"
 *   line per chain that reaches OPTS.min, followed by the stats.
 * @param {string} [outfile] - unused; accepted so the signature matches
 *   write_results.
 * @returns {Promise<void>}
 */
const output_results = async (stats, chains, format, outfile) => {
  if (format === "json") {
    const data = { stats, chains, date: new Date() };
    // Serialise explicitly: console.log on an object prints an inspection
    // dump (nested values abbreviated), which is not JSON. The layout
    // matches what write_results stores in a file.
    console.log(JSON.stringify(data, null, 4));
    return;
  }
  for (const [count, url] of sort_request_chains(chains, OPTS.min)) {
    console.log(count, url);
  }
  console.log(stats);
};
/**
 * Write the results to `outfile`, replacing any existing content.
 *
 * @param {object} stats - run counters (only stored in the json format).
 * @param {object} chains - map of chain key -> { count, comes_from, full_chain }.
 * @param {string} format - "json" stores one indented JSON document holding
 *   stats, chains and the current date; any other value stores one
 *   "count chain" line per chain that reaches OPTS.min.
 * @param {string} outfile - path of the file to write.
 * @returns {Promise<void>} rejects with an AssertionError when outfile is
 *   missing, or with the file system error when the write fails.
 */
const write_results = async (stats, chains, format, outfile) => {
  assert(outfile, "Output file required.");
  let content;
  if (format === "json") {
    content = JSON.stringify({ stats, chains, date: new Date() }, null, 4);
  } else {
    content = sort_request_chains(chains, OPTS.min)
      .map(([count, url]) => `${count} ${url}\n`)
      .join("");
  }
  // One call opens, writes and closes the file, so a failed write cannot
  // leave a descriptor open the way a manual open/write/close sequence can.
  fs.writeFileSync(outfile, content);
};
// Entry point: validate --min, parse every log matched by --input, build the
// request chains and send them to --outfile when given, otherwise to stdout.
assert(!Number.isNaN(OPTS.min), `min must be a number, you have ${OPTS.min}`);
const [by_ip, stats] = await parse_logs_glob(OPTS.input, OPTS.errors);
const chains = construct_request_chains(by_ip, OPTS.domain);
// Both writers are async; await them so a failure is raised from here
// instead of being left on a floating promise.
if (OPTS.outfile) {
  await write_results(stats, chains, OPTS.format, OPTS.outfile);
} else {
  await output_results(stats, chains, OPTS.format, OPTS.outfile);
}