Better parser that can do a Set instead of just a list of URLs in a chain.

master
Zed A. Shaw 2 years ago
parent 503007149f
commit 366fc83bf2
  1. 238
      02-filter-a-log-file/chains_parser.js
  2. 18
      02-filter-a-log-file/package-lock.json
  3. 1
      02-filter-a-log-file/package.json

@ -9,13 +9,20 @@ class Parser {
this.next = ""; this.next = "";
} }
match(reg) { match(reg, consume=true) {
const n = this.next.match(reg); const n = this.next.match(reg);
const element = n.length > 1 ? n.slice(1) : n[0]; if(n === null) {
return undefined;
} else {
const element = n.length > 1 ? n.slice(1) : n[0];
if(consume) {
this.next = this.next.slice(n[0].length);
}
this.next = this.next.slice(n[0].length); return element;
return element; }
} }
start(line) { start(line) {
@ -23,122 +30,195 @@ class Parser {
this.next = line; this.next = line;
} }
ws() { ws(consume=true) {
this.match(/ +/); return this.match(/ +/, consume);
} }
parse() { parse_new_log(ip) {
try { const [port, conn_id, conn_count] = this.match(/^([0-9]+):([0-9]+):([0-9]+)/);
const [ip, port, conn_id, conn_count] = this.match(/([0-9.]+):([0-9]+):([0-9]+):([0-9]+)/);
this.ws(); this.ws();
const [ time ] = this.match(/\[(.*)\]/); const [ time ] = this.match(/\[(.*)\]/);
this.ws(); this.ws();
const [ url ] = this.match(/"(.+?)"/); const [ full_url ] = this.match(/"(.+?)"/);
this.ws(); const [url, params] = full_url.split("?");
const code = this.match(/\-|[0-9]+/); this.ws();
this.ws(); const code = this.match(/\-|[0-9]+/);
const bytes = this.match(/\-|[0-9]+/); this.ws();
this.ws(); const bytes = this.match(/\-|[0-9]+/);
const [refer] = this.match(/"(.+?)"/); this.ws();
this.ws(); const [refer] = this.match(/"(.+?)"/);
const [ua] = this.match(/"(.+?)"/); this.ws();
return { const [ua] = this.match(/"(.+?)"/);
ip,
conn_id: parseInt(conn_id), return {
conn_count: parseInt(conn_count), ip,
time: new Date(time), conn_id: parseInt(conn_id),
url, conn_count: parseInt(conn_count),
code: parseInt(code), time: new Date(time),
size: parseInt(bytes), url, params,
refer: refer === '-' ? undefined : refer, code: parseInt(code),
ua: UAParser(ua) size: parseInt(bytes),
}; refer: refer === '-' ? undefined : refer,
} catch(error) { ua: UAParser(ua)
throw new Error(`Parsing Error: ${ this.next }`); };
}
} }
}
const read_stream = fs.createReadStream(process.argv[2]); parse_old_log(ip) {
this.match(/- -/);
this.ws();
// FORMAT: 29/Mar/2022:22:40:52 +0200
const [ day, month, year, hour, minute, seconds, tz_offset ] = this.match(/\[([0-9]+)\/([A-Za-z]+)\/([0-9]+):([0-9]+):([0-9]+):([0-9]+) (.+?)\]/);
this.ws();
const rl = readline.createInterface({ const [ method, full_url, http_version ] = this.match(/"([A-Z]+) (.+) HTTP\/([0-9].[0-9])"/);
input: read_stream,
crlfDelay: Infinity
});
const parser = new Parser(); const [ url, params ] = full_url.split("?");
const chains = {};
const skip = /(authcheck\/?|.*.svg|.*.webmanifest|.*.js|.*.css|.*.png|.*.txt|.*.woff|.*.jpg|.*.mp4|.*.torrent|\-|.*.ico|\/api\/.*\?.*|.*.html|.*.map)$/ this.ws();
for await (let line of rl) { const code = this.match(/\-|[0-9]+/);
parser.start(line);
try { this.ws();
const data = parser.parse();
if(data.ua.os && data.code === 200 && !data.url.match(skip)) { const bytes = this.match(/\-|[0-9]+/);
let chain = data.ip in chains ? chains[data.ip] : [];
chain.push([data.time, data.url, data.refer]); this.ws();
chains[data.ip] = chain; const [refer] = this.match(/"(.+?)"/);
this.ws();
const [ua] = this.match(/"(.+?)"/);
// this is another IP address sometimes in another log format that I'll ignore
const unknown = this.match(/".+?"$/);
return {
ip,
method,
http_version,
time: new Date(`${day} ${month} ${year} ${hour}:${minute}:${seconds} ${tz_offset}`),
url, params,
code: parseInt(code),
size: parseInt(bytes),
refer: refer === '-' ? undefined : refer,
ua: UAParser(ua)
};
}
parse() {
const ip = this.match(/^[0-9\.]+/);
const test = this.match(/(:| )/);
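// BUG: loose == is needed here because match() returns its capture groups as an array (e.g. [":"]) when the regex contains a group; == coerces that array to the string ":", while === compares an array to a string and is always false.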
if(test == ":") {
return this.parse_new_log(ip);
} else if(test == " ") {
return this.parse_old_log(ip);
} else {
// console.log(`PARSE ERROR, expected : or ' ' but got ${test}`);
return {};
} }
} catch(err) {
if(line !== "") console.error(err);
} }
} }
const uniques = {}; const parse_logs = async (file_name) => {
const read_stream = fs.createReadStream(file_name);
for(let key in chains) { const rl = readline.createInterface({
const chain = chains[key]; input: read_stream,
crlfDelay: Infinity
});
const first = chain[0][2]; const parser = new Parser();
const urls = chain.map(s => { const stats = {
const ref = s[2]; lines: 0,
if(ref && ref !== first && !ref.includes("learnjsthehardway.com") ) { chains: 0,
return `${s[1]}(${s[2]})`; excluded: 0,
} else { errors: 0,
return s[1]; roots: 0,
firsts: 0
};
const skip = /(authcheck|.*\.svg|.*\.webmanifest|.*\.js|.*\.css|.*php|socket\.io|\.env|.*\.png|.*\.txt|.*\.woff|.*\.jpg|.*\.mp4|.*\.torrent|\-|.*\.ico|\/api\/.*\?.*|.*\.html|.*\.map|.*.php)/
const by_ip = {};
for await (let line of rl) {
try {
stats.lines += 1;
parser.start(line);
const data = parser.parse();
// skip lines that have content we don't care about
if(data.url.match(skip)) continue;
// store or update the chain in the by_ip chain
const ip_chain = by_ip[data.ip] || [];
ip_chain.push(data);
by_ip[data.ip] = ip_chain;
} catch(error) {
stats.errors += 1;
} }
}); }
return [by_ip, stats];
}
/**
 * Reduce a sequence of request records to the distinct URLs they contain.
 *
 * Duplicates are dropped and the remaining URLs keep the order in which
 * they were first seen, because a Set iterates in insertion order.
 *
 * @param {Iterable<{url: *}>} requests - request objects, each read via `.url`
 * @returns {Iterator} a Set values-iterator over the unique URLs; it can
 *   only be consumed once (e.g. by spreading it into an array)
 */
const chain_to_set = (requests) => {
  const visited = [...requests].map((request) => request.url);
  return new Set(visited).values();
};
const sort_request_chains = (by_ip) => {
let ip_chains = {};
let seen; let seen;
let full = urls.filter((s, index, self) => { for(let [ip, requests] of Object.entries(by_ip)) {
if(s === seen) { const chain = chain_to_set(requests);
return false;
} else { const ref = requests[0].refer ? `[${requests[0].refer}]` : "";
seen = s; const url_set = [ref, ...chain].join(" ");
return true;
}
}).join(" ");
if(first) { ip_chains[url_set] = url_set in ip_chains ? ip_chains[url_set] + 1 : 1;
full = `[${first}] ${full}`;
} }
uniques[full] = full in uniques ? uniques[full] + 1 : 1; const chains_sorted = Object.entries(ip_chains);
chains_sorted.sort((a, b) => b[1] - a[1]);
return chains_sorted;
} }
const sorted = Object.entries(uniques); const [by_ip, stats] = await parse_logs(process.argv[2]);
sorted.sort((a, b) => b[1] - a[1]); const chains_sorted = sort_request_chains(by_ip);
for(let [url, count] of sorted) { for(let [url, count] of chains_sorted) {
console.log(count, url); console.log(count, url);
} }
console.log(stats);

@ -10,6 +10,7 @@
"license": "BSD", "license": "BSD",
"dependencies": { "dependencies": {
"ava": "^4.3.1", "ava": "^4.3.1",
"date-fns": "^2.29.1",
"ua-parser-js": "^1.0.2" "ua-parser-js": "^1.0.2"
} }
}, },
@ -500,6 +501,18 @@
"node": ">=0.10.0" "node": ">=0.10.0"
} }
}, },
"node_modules/date-fns": {
"version": "2.29.1",
"resolved": "https://registry.npmjs.org/date-fns/-/date-fns-2.29.1.tgz",
"integrity": "sha512-dlLD5rKaKxpFdnjrs+5azHDFOPEu4ANy/LTh04A1DTzMM7qoajmKCBc8pkKRFT41CNzw+4gQh79X5C+Jq27HAw==",
"engines": {
"node": ">=0.11"
},
"funding": {
"type": "opencollective",
"url": "https://opencollective.com/date-fns"
}
},
"node_modules/date-time": { "node_modules/date-time": {
"version": "3.1.0", "version": "3.1.0",
"resolved": "https://registry.npmjs.org/date-time/-/date-time-3.1.0.tgz", "resolved": "https://registry.npmjs.org/date-time/-/date-time-3.1.0.tgz",
@ -2164,6 +2177,11 @@
"array-find-index": "^1.0.1" "array-find-index": "^1.0.1"
} }
}, },
"date-fns": {
"version": "2.29.1",
"resolved": "https://registry.npmjs.org/date-fns/-/date-fns-2.29.1.tgz",
"integrity": "sha512-dlLD5rKaKxpFdnjrs+5azHDFOPEu4ANy/LTh04A1DTzMM7qoajmKCBc8pkKRFT41CNzw+4gQh79X5C+Jq27HAw=="
},
"date-time": { "date-time": {
"version": "3.1.0", "version": "3.1.0",
"resolved": "https://registry.npmjs.org/date-time/-/date-time-3.1.0.tgz", "resolved": "https://registry.npmjs.org/date-time/-/date-time-3.1.0.tgz",

@ -11,6 +11,7 @@
"license": "BSD", "license": "BSD",
"dependencies": { "dependencies": {
"ava": "^4.3.1", "ava": "^4.3.1",
"date-fns": "^2.29.1",
"ua-parser-js": "^1.0.2" "ua-parser-js": "^1.0.2"
} }
} }

Loading…
Cancel
Save