From 7c0784ab860d5675d2cd42f168da4bef44142fcd Mon Sep 17 00:00:00 2001
From: Bob Carroll
Date: Tue, 7 Nov 2023 18:02:43 -0600
Subject: [PATCH] initial commit

---
 analyze.py | 104 ++++++++++++++++++++++++++++++++++++++++++++++++++
 concat.py  |  52 +++++++++++++++++++++++++
 parse.py   | 109 +++++++++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 265 insertions(+)
 create mode 100644 analyze.py
 create mode 100644 concat.py
 create mode 100644 parse.py

diff --git a/analyze.py b/analyze.py
new file mode 100644
index 0000000..e2fa625
--- /dev/null
+++ b/analyze.py
@@ -0,0 +1,104 @@
+
+import sys
+import json
+import pprint
+import gzip
+
+
+def group_by_key(events, key_fn):
+    groups = {}
+
+    for x in events:
+        key = key_fn(x)
+        if not key:
+            continue
+
+        groups.setdefault(key, [])
+        groups[key].append(x)
+
+    return groups
+
+
+def count_by_key(events, key_fn):
+    unique = {}
+
+    for x in events:
+        key = key_fn(x)
+        if not key:
+            continue
+
+        unique.setdefault(key, 0)
+        unique[key] += 1
+
+    return unique
+
+
+def sort_by_count(counts):
+    for k, v in reversed(sorted(counts.items(), key=lambda x: x[1])):
+        yield k, v
+
+
+def get_msg(x):
+    return x['tags']['msg'][0] if 'msg' in x['tags'] else x['msg']
+
+
+def print_server_report(f, events):
+    f.write('\n\n### Events grouped by server ###\n')
+    groups = group_by_key(events, lambda x: x['headers']['server'])
+
+    def key_fn(x):
+        rule = x['tags']['id'][0]
+        path = x['headers']['request']['path']
+        msg = get_msg(x)
+        return f'{rule} {path} "{msg}"'
+
+    for key, event in groups.items():
+        f.write(f'\n# {key}\n')
+        counts = count_by_key(event, key_fn)
+
+        for msg, count in sort_by_count(counts):
+            f.write(f'{msg} {count:,}\n')
+
+
+def print_client_report(f, events):
+    f.write('\n\n### Events grouped by client ###\n')
+    groups = group_by_key(events, lambda x: x['headers'].get('client'))
+
+    for key, event in groups.items():
+        f.write(f'\n# {key}\n')
+        counts = count_by_key(event, get_msg)
+
+        for reason, count in sort_by_count(counts):
+            f.write(f'"{reason}" {count:,}\n')
+
+
+def print_ip_addr_report(f, events):
+    f.write('\n\n### Event counts by IP addresses ###\n\n')
+    counts = count_by_key(events, lambda x: x['headers'].get('client'))
+
+    for ip_addr, count in sort_by_count(counts):
+        f.write(f'{ip_addr} {count:,}\n')
+
+
+def print_reason_report(f, events):
+    f.write('\n\n### Event counts by block reason ###\n\n')
+    counts = count_by_key(events, get_msg)
+
+    for reason, count in sort_by_count(counts):
+        f.write(f'"{reason}" {count:,}\n')
+
+
+if __name__ == '__main__':
+    if len(sys.argv) != 3:
+        print('USAGE: python analyze.py <events file> <report file>')
+        sys.exit(1)
+
+    with gzip.open(sys.argv[1], 'r') as f:
+        events = json.loads(f.read())
+
+    with open(sys.argv[2], 'w') as f:
+        f.write('###### ModSecurity Blocked Events Report ######\n')
+        print_server_report(f, events)
+        print_client_report(f, events)
+        print_ip_addr_report(f, events)
+        print_reason_report(f, events)
diff --git a/concat.py b/concat.py
new file mode 100644
index 0000000..cdb1296
--- /dev/null
+++ b/concat.py
@@ -0,0 +1,52 @@
+
+import os
+import sys
+import json
+import gzip
+
+
+def scan_dir(path):
+    return [os.path.join(path, x) for x in os.listdir(path) if x.endswith('.json.gz')]
+
+
+def dedup(events, output):
+    count = 0
+
+    for x in events:
+        key = x['hash']
+
+        if key in output:
+            count += 1
+        else:
+            output[key] = x
+
+    return count
+
+
+def concat_files(paths):
+    events = {}
+    dups = 0
+
+    for i, x in enumerate(paths):
+        with gzip.open(x, 'r') as f:
+            print(f'Reading {i+1}/{len(paths)} {x}')
+            dups += dedup(json.loads(f.read()), events)
+
+    return list(events.values()), dups
+
+
+def write_results(events, out_file, dups):
+    with gzip.open(out_file, 'w') as f:
+        f.write(json.dumps(events).encode('utf-8'))
+
+    print(f'Wrote {len(events):,} events to {out_file}, Skipped {dups} duplicate(s)')
+
+
+if __name__ == '__main__':
+    if len(sys.argv) != 3:
+        print('USAGE: python concat.py <input dir> <output file>')
+        sys.exit(1)
+
+    paths = scan_dir(sys.argv[1])
+    events, dups = concat_files(paths)
+    write_results(events, sys.argv[2] + '.json.gz', dups)
diff --git a/parse.py b/parse.py
new file mode 100644
index 0000000..d3e58fa
--- /dev/null
+++ b/parse.py
@@ -0,0 +1,109 @@
+
+import os
+import sys
+import re
+import json
+import hashlib
+import gzip
+
+line_pattern = re.compile(
+    r'(\d{4}\/\d{2}\/\d{2}) (\d{2}:\d{2}:\d{2}) \[([A-Za-z]+)\] (\d+#\d+): ([*]\d+)( \[client [^]]+\])? ModSecurity: ([^[]+) (.*)')
+headers_pattern = re.compile(r', ([^:]+): ([^,]+)')
+request_pattern = re.compile(r'"([^ ]+) ([^ ]+) ([^ ]+)"')
+
+
+def pivot_tags(tags):
+    results = {}
+
+    for x in tags:
+        key, value = x
+
+        if key in results:
+            results[key].append(value)
+        else:
+            results[key] = [value]
+
+    return results
+
+
+def parse_line(log_name, line):
+    m = line_pattern.match(line)
+    if not m:
+        return line
+
+    fields = {'date': m[1],
+              'time': m[2],
+              'level': m[3],
+              'process': m[4],
+              'request': m[5],
+              'msg': m[7],
+              'tags': pivot_tags(re.findall(r'\[([^ ]+) "?([^"]+)"?\] ', m[8])),
+              'headers': {k:v for k, v in headers_pattern.findall(m[8])},
+              'hash': hashlib.md5(line.encode('utf-8')).hexdigest(),
+              'log_name': log_name}
+
+    if 'request' in fields['headers']:
+        rm = request_pattern.findall(fields['headers']['request'])
+        fields['headers']['request'] = {
+            'method': rm[0][0],
+            'path': rm[0][1],
+            'version': rm[0][2]}
+
+    return fields
+
+
+def read_log(path):
+    if path.endswith('.gz'):
+        f = gzip.open(path, 'rb')
+    else:
+        f = open(path, 'rb')
+
+    lines = (x.decode('latin-1') for x in f.readlines())
+    f.close()
+
+    log_name = os.path.basename(path)
+    return [parse_line(log_name, x.strip()) for x in lines if 'ModSecurity' in x]
+
+
+def partition_results(events):
+    parsed = []
+    failed = []
+
+    for x in events:
+        if isinstance(x, str):
+            failed.append(x)
+        else:
+            parsed.append(x)
+
+    return parsed, failed
+
+
+def write_results(events, out_file, err_file):
+    parsed, failed = partition_results(events)
+
+    with gzip.open(out_file, 'w') as f:
+        f.write(json.dumps(parsed).encode('utf-8'))
+
+    if len(failed):
+        with open(err_file, 'w') as f:
+            for x in failed:
+                f.write(x + '\n')
+
+    print(f'Wrote {out_file}, Parsed: {len(parsed)}, Failed: {len(failed)}')
+
+
+if __name__ == '__main__':
+    if len(sys.argv) != 3:
+        print('USAGE: python parse.py <log file> <output dir>')
+        sys.exit(1)
+
+    try:
+        os.mkdir(sys.argv[2])
+    except FileExistsError:
+        pass
+
+    out_file = os.path.join(sys.argv[2], sys.argv[1] + '.json.gz')
+    err_file = os.path.join(sys.argv[2], sys.argv[1] + '.failed')
+
+    events = read_log(sys.argv[1])
+    write_results(events, out_file, err_file)