You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
295 lines
6.9 KiB
295 lines
6.9 KiB
import fs from "fs";
|
|
import assert from "assert";
|
|
import ua_parser from "ua-parser-js";
|
|
import readline from "readline";
|
|
import glob from "fast-glob";
|
|
import format from "date-fns/format/index.js";
|
|
|
|
/** Short human-readable summary of what this command does. */
export const description = "Processes different web server logs to determine request chain frequency.";

/**
 * Optional command line flags, one `[flag spec, help text, value]` triple per
 * entry. The third element is presumably the default used when the flag is
 * not given -- confirm against the command runner that consumes this export.
 */
export const options = [
  ["--min <Number>", "The lowest count to print. Stop at this.", 1],
  ["--errors", "Show the errors so you can fix them.", false],
];

/**
 * Mandatory command line flags, one `[flag spec, help text]` pair per entry.
 * `main` reads them as `OPTS.output`, `OPTS.domain` and `OPTS.input`.
 */
export const required = [
  ["--output <String>", "Save to file rather than stdout."],
  ["--domain <String>", "Domain for the log. Gets removed as a refer."],
  ["--input <String>", "Input file glob."],
];
|
|
|
|
/**
 * Cursor-style parser for one web server access-log line.
 *
 * `text` keeps the complete line given to `start()`; `next` is the part that
 * has not been consumed yet. `parse()` reads the leading IP address and then
 * dispatches on the character that follows it:
 *
 *   - `:`  -> `parse_new_log`: `port:conn_id:conn_count [time] "url" code bytes "refer" "ua"`
 *   - ` `  -> `parse_old_log`: `- - [dd/Mon/yyyy:hh:mm:ss tz] "METHOD url HTTP/x.y" code bytes "refer" "ua"`
 *
 * The field parsers destructure the result of `match()`, so a line that does
 * not fit the expected layout makes them throw a TypeError; callers are
 * expected to catch that.
 */
class Parser {
  constructor() {
    this.text = "";
    this.next = "";
  }

  /**
   * Search the unconsumed input for `reg`.
   *
   * @param {RegExp} reg - pattern to look for; must not carry the `g` flag,
   *   because the match offset is needed to advance the cursor.
   * @param {boolean} [consume=true] - when true, drop everything up to and
   *   including the match from `next`.
   * @returns {string|string[]|undefined} `undefined` when nothing matches, the
   *   array of capture groups when `reg` has groups, otherwise the matched text.
   */
  match(reg, consume=true) {
    const found = this.next.match(reg);

    if(found === null) {
      return undefined;
    }

    const element = found.length > 1 ? found.slice(1) : found[0];

    if(consume) {
      // Cut at the END of the match. Cutting only `found[0].length` characters
      // from the start is wrong whenever the match begins after offset 0 and
      // leaves the cursor in the middle of the field that was just read.
      this.next = this.next.slice(found.index + found[0].length);
    }

    return element;
  }

  /**
   * Begin parsing a new line, discarding any previous state.
   *
   * @param {string} line - the raw log line.
   */
  start(line) {
    this.text = line;
    this.next = line;
  }

  /**
   * Skip a run of spaces at the cursor, if there is one.
   *
   * Anchored so that it is a no-op when the cursor is not on a space instead
   * of reaching ahead to some later space in the line.
   *
   * @param {boolean} [consume=true] - forwarded to `match`.
   * @returns {string|undefined} the spaces that were found, if any.
   */
  ws(consume=true) {
    return this.match(/^ +/, consume);
  }

  /**
   * Parse the remainder of a "new" format line (cursor is just after `ip:`).
   *
   * @param {string} ip - the client address already read by `parse()`.
   * @returns {Object} `{ip, conn_id, conn_count, time, url, params, code, size, refer, ua}`
   * @throws {TypeError} when a required field is missing.
   */
  parse_new_log(ip) {
    // the port is part of the line but is not reported
    const [, conn_id, conn_count] = this.match(/^([0-9]+):([0-9]+):([0-9]+)/);

    this.ws();

    // non-greedy: stop at the first `]` so brackets in the URL, referer or
    // user agent further down the line do not end up in the timestamp
    const [ time ] = this.match(/\[(.*?)\]/);

    this.ws();

    const [ full_url ] = this.match(/"(.+?)"/);
    const [url, params] = full_url.split("?");

    this.ws();

    const code = this.match(/\-|[0-9]+/);

    this.ws();

    const bytes = this.match(/\-|[0-9]+/);

    this.ws();

    // `.*?` rather than `.+?`: an empty `""` field must not swallow the
    // opening quote of the following field
    const [refer] = this.match(/"(.*?)"/);

    this.ws();

    const [ua] = this.match(/"(.*?)"/);

    return {
      ip,
      conn_id: parseInt(conn_id, 10),
      conn_count: parseInt(conn_count, 10),
      time: new Date(time),
      url, params,
      // a literal "-" in the log yields NaN here
      code: parseInt(code, 10),
      size: parseInt(bytes, 10),
      refer: refer === '-' ? undefined : refer,
      ua: ua_parser(ua)
    };
  }

  /**
   * Parse the remainder of an "old" format line (cursor is just after `ip `).
   *
   * @param {string} ip - the client address already read by `parse()`.
   * @returns {Object} `{ip, method, http_version, time, url, params, code, size, refer, ua}`
   * @throws {TypeError} when a required field is missing.
   */
  parse_old_log(ip) {
    // identity and user columns; when they are not both "-" nothing is
    // consumed here and the timestamp search below skips over them
    this.match(/^- -/);

    this.ws();

    // FORMAT: 29/Mar/2022:22:40:52 +0200
    const [ day, month, year, hour, minute, seconds, tz_offset ] = this.match(/\[([0-9]+)\/([A-Za-z]+)\/([0-9]+):([0-9]+):([0-9]+):([0-9]+) (.+?)\]/);

    this.ws();

    const [ method, full_url, http_version ] = this.match(/"([A-Z]+) (.+) HTTP\/([0-9]\.[0-9])"/);
    const [ url, params ] = full_url.split("?");

    this.ws();

    const code = this.match(/\-|[0-9]+/);

    this.ws();

    const bytes = this.match(/\-|[0-9]+/);

    this.ws();

    // `.*?` rather than `.+?`: an empty `""` field must not swallow the
    // opening quote of the following field
    const [refer] = this.match(/"(.*?)"/);

    this.ws();

    const [ua] = this.match(/"(.*?)"/);

    // this is another IP address sometimes in another log format that I'll ignore
    this.match(/".+?"$/);

    return {
      ip,
      method,
      http_version,
      // NOTE(review): relies on the engine's parsing of a non-ISO date string
      time: new Date(`${day} ${month} ${year} ${hour}:${minute}:${seconds} ${tz_offset}`),
      url, params,
      // a literal "-" in the log yields NaN here
      code: parseInt(code, 10),
      size: parseInt(bytes, 10),
      refer: refer === '-' ? undefined : refer,
      ua: ua_parser(ua)
    };
  }

  /**
   * Parse the line given to `start()`.
   *
   * @returns {Object} the parsed record, or `{}` when the line does not begin
   *   with an IP address followed by `:` or a space.
   * @throws {TypeError} when the line starts correctly but a later field is malformed.
   */
  parse() {
    const ip = this.match(/^[0-9\.]+/);

    // No capture group on purpose: `match` then returns the separator as a
    // plain string, so it can be compared with `===`. (With a group it returns
    // a one-element array, which only loose `==` coerces to the character.)
    const separator = this.match(/^[: ]/);

    if(separator === ":") {
      return this.parse_new_log(ip);
    }

    if(separator === " ") {
      return this.parse_old_log(ip);
    }

    return {};
  }
}
|
|
|
|
/**
 * Stream one log file line by line and file every kept request under
 * `results[yyyy-MM-dd][ip]`, in the order the lines appear in the file.
 *
 * A line that cannot be parsed, or whose timestamp cannot be formatted, is
 * counted in `stats.errors` and skipped; it never aborts the whole file.
 *
 * @param {Object} results - accumulator keyed by day, then by IP; mutated in place.
 * @param {{lines: number, excluded: number, errors: number}} stats - running counters; mutated in place.
 * @param {string} file_name - path of the log file to read.
 * @param {boolean} errors - when truthy, print each failing line's error to stderr.
 * @returns {Promise<void>}
 */
const parse_log_file = async (results, stats, file_name, errors) => {
  const read_stream = fs.createReadStream(file_name);

  const rl = readline.createInterface({
    input: read_stream,
    crlfDelay: Infinity
  });

  const parser = new Parser();

  // NOTE(review): the pattern is unanchored, so the bare `\-` alternative
  // excludes every URL that contains a dash, and `.*\.js` also hits `.json`.
  // `\/api\/.*\?.*` looks unreachable because the parser strips the query
  // string from `url`. Kept as is -- confirm the intent before tightening.
  const skip = /(authcheck|.*\.svg|.*\.webmanifest|.*\.js|.*\.css|.*php|socket\.io|\.env|.*\.png|.*\.txt|.*\.woff|.*\.jpg|.*\.mp4|.*\.torrent|\-|.*\.ico|\/api\/.*\?.*|.*\.html|.*\.map|.*.php)/

  // 1-based position inside this file (stats.lines counts across all files)
  let line_number = 0;

  for await (const line of rl) {
    line_number += 1;

    try {
      stats.lines += 1;
      parser.start(line);

      const data = parser.parse();

      // parse() returns {} for lines it does not recognize; report that as
      // what it is rather than letting `undefined.match` raise a TypeError
      if(typeof data.url !== "string") {
        throw new Error(`unrecognized log line: ${line}`);
      }

      // skip lines that have content we don't care about
      if(skip.test(data.url)) {
        stats.excluded += 1;
        continue;
      }

      // convert the date to a only day length
      const date_key = format(data.time, "yyyy-MM-dd");

      // get the date entry or a new one
      const date_entry = results[date_key] || {};

      // store or update the chain in the by_ip chain
      const ip_chain = date_entry[data.ip] || [];
      ip_chain.push(data);
      date_entry[data.ip] = ip_chain;

      // and update this date entry
      results[date_key] = date_entry;
    } catch(error) {
      // say where the problem is so the line can actually be found and fixed
      if(errors) console.error(`${file_name}:${line_number}`, error);

      stats.errors += 1;
    }
  }
}
|
|
|
|
/**
 * Expand a file glob and run every matching log file through
 * `parse_log_file`, one after the other, into a shared accumulator.
 *
 * Terminates the process with exit code 1 when the glob matches nothing.
 *
 * @param {string} file_glob - glob pattern selecting the log files.
 * @param {boolean} errors - forwarded to `parse_log_file`.
 * @returns {Promise<[Object, {lines: number, excluded: number, errors: number}]>}
 *   the per-day results and the counters collected over all files.
 */
const parse_logs_glob = async (file_glob, errors) => {
  const matched_files = glob.sync(file_glob);

  if(matched_files.length < 1) {
    console.error("!! NO FILES FOR GLOB", file_glob);
    process.exit(1);
  }

  const by_date = {};
  const counters = { lines: 0, excluded: 0, errors: 0 };

  // strictly sequential: every file appends to the same accumulator
  for(let i = 0; i < matched_files.length; i += 1) {
    const current = matched_files[i];

    console.log("PARSE", current);
    await parse_log_file(by_date, counters, current, errors);
  }

  return [by_date, counters];
}
|
|
|
|
/**
 * Reduce a list of requests to the distinct URLs it touches, in order of
 * first appearance.
 *
 * Returns an array rather than the Set's one-shot iterator so the result can
 * be iterated more than once; spreading it works exactly as before.
 *
 * @param {Array<{url: string}>} requests - parsed requests.
 * @returns {string[]} unique URLs in first-seen order.
 */
const chain_to_set = (requests) => {
  const seen = new Set();

  for(const { url } of requests) {
    seen.add(url);
  }

  return [...seen];
}
|
|
|
|
/**
 * Build the space-separated key that identifies a request chain.
 *
 * When the visit came in through a referer that does not contain `domain`,
 * the first URL of the chain is replaced by the referer wrapped in brackets,
 * e.g. `[https://google.com/] /b /c`. Otherwise the key is the chain itself.
 *
 * NOTE(review): the external-referer form drops `full_chain[0]` (the landing
 * URL) via `slice(1)` -- confirm that this is intended and not a slip for
 * prepending the tag.
 *
 * @param {string} domain - the site's own domain; referers containing it count as internal.
 * @param {string|undefined} ref - referer of the first request, if any.
 * @param {string[]} full_chain - distinct URLs of the chain in visit order.
 * @returns {string} the chain key.
 */
const construct_url_set = (domain, ref, full_chain) => {
  const is_external = Boolean(ref) && !ref.includes(domain);

  if(!is_external) {
    return full_chain.join(" ");
  }

  // this tags the chains with refer using [ref]
  const tagged = [`[${ref}]`, ...full_chain.slice(1)];

  return tagged.join(" ");
}
|
|
|
|
/**
 * Count how many IPs produced each distinct request chain.
 *
 * For every IP the distinct URLs are collected in visit order, turned into a
 * key by `construct_url_set`, and tallied. The first IP seen for a key
 * supplies the `comes_from` and `full_chain` stored with it.
 *
 * The tally is kept in a Map because the keys come straight from log data: on
 * a plain object a chain such as `constructor` or `__proto__` would resolve to
 * (or overwrite) members of `Object.prototype` instead of being counted.
 *
 * @param {Object<string, Array<{url: string, refer: (string|undefined)}>>} by_ip
 *   requests grouped by IP; every list must be non-empty.
 * @param {string} domain - forwarded to `construct_url_set`.
 * @returns {Object<string, {count: number, comes_from: (string|undefined), full_chain: string[]}>}
 *   chain key -> tally, in first-seen order.
 */
const construct_request_chains = (by_ip, domain) => {
  const tally = new Map();

  for(const requests of Object.values(by_ip)) {
    const full_chain = [...chain_to_set(requests)];

    // record the initial refer to track entry to the site
    const ref = requests[0].refer;
    const url_set = construct_url_set(domain, ref, full_chain);

    const known = tally.get(url_set);

    if(known === undefined) {
      tally.set(url_set, { count: 1, comes_from: ref, full_chain });
    } else {
      known.count += 1;
    }
  }

  // fromEntries defines own data properties, so even a `__proto__` key
  // survives as a regular entry and shows up in the JSON output
  return Object.fromEntries(tally);
}
|
|
|
|
/**
 * Turn the per-day, per-IP requests into request-chain tallies and write them
 * to `outfile` as a single JSON document of the shape
 * `{stats, domain, result, generated_on}`.
 *
 * @param {Object} stats - line counters, copied into the output unchanged.
 * @param {Object} data - requests grouped by day, then by IP.
 * @param {string} domain - the site's own domain, forwarded to `construct_request_chains`.
 * @param {number} min - lowest `count` a chain needs to be included. Values
 *   that are not finite or are <= 1 keep every chain (counts start at 1).
 * @param {string} outfile - path of the file to (over)write.
 * @returns {Promise<void>}
 * @throws {Error} whatever `fs.writeFileSync` raises when the file cannot be written.
 */
const output_json = async (stats, data, domain, min, outfile) => {
  console.log("OUTPUT FILE", outfile);

  // `min` used to be accepted and then ignored; honor it as the --min help
  // text describes. The default of 1 keeps everything, as before.
  const apply_min = Number.isFinite(min) && min > 1;
  const result = {};

  for(const [date_key, by_ip] of Object.entries(data)) {
    const chains = construct_request_chains(by_ip, domain);

    result[date_key] = apply_min
      ? Object.fromEntries(Object.entries(chains).filter(([, chain]) => chain.count >= min))
      : chains;
  }

  const output = {stats, domain, result, generated_on: new Date()};

  console.log("WRITE");

  // writeFileSync writes the complete buffer and closes the descriptor even
  // when the write fails, which a bare openSync/writeSync/closeSync does not
  fs.writeFileSync(outfile, JSON.stringify(output));
}
|
|
|
|
/**
 * Command entry point: parse every log file matched by `OPTS.input`, print the
 * line counters, write the request-chain report to `OPTS.output`, then end
 * the process with exit code 0.
 *
 * @param {Object} OPTS - parsed command line options (`min`, `errors`,
 *   `input`, `domain`, `output`). `OPTS.min` is replaced by its integer value.
 * @returns {Promise<void>} in practice never resolves, because the process exits.
 * @throws {AssertionError} when `OPTS.min` is not an integer string/number.
 */
export const main = async (OPTS) => {
  const min = Number.parseInt(OPTS.min, 10);

  // validate before overwriting, so the message shows what was actually typed
  assert(!Number.isNaN(min), `min must be a number, you have ${OPTS.min}`);
  OPTS.min = min;

  const [by_date, stats] = await parse_logs_glob(OPTS.input, OPTS.errors);

  console.log("STATS", stats);

  await output_json(stats, by_date, OPTS.domain, OPTS.min, OPTS.output);

  process.exit(0);
}
|
|
|