"""Print the lines of a one-page PDF report, tagging numbers and text.

Usage: pass the path of the PDF as the first command-line argument.

For every non-blank line of the extracted text the script prints:

* the integer value, when the line holds only digits, commas and
  whitespace and parses as an integer once the commas are removed;
* ``BAD <repr of line>`` when such a line still fails to parse
  (for example digits separated by inner whitespace, or only commas);
* ``TEXT <line>`` for anything else.
"""
import re
import sys
from typing import Optional

# A line made up solely of digits, thousands separators and whitespace.
numbers = re.compile(r"^[,\d\s]+$")
# A line that is empty or whitespace-only.
ignore = re.compile(r"^\s*$")


def classify_line(line: str) -> Optional[str]:
    """Return the text to print for one extracted line.

    Args:
        line: A single line of extracted text, without its newline.

    Returns:
        ``None`` for blank lines and a lone form feed, which are skipped.
        Otherwise the string to print: the parsed integer, ``BAD <repr>``
        for a numeric-looking line that does not parse, or ``TEXT <line>``.
    """
    if line == "\x0c" or ignore.match(line):
        # Blank lines and trailing junk carry no information.
        return None
    if not numbers.match(line):
        return "TEXT {}".format(line)
    try:
        # int() tolerates surrounding whitespace but not inner whitespace,
        # so a line such as "12 34" ends up in the BAD branch.
        value = int(line.replace(",", ""))
    except ValueError:
        return "BAD {!r}".format(line)
    return str(value)


def main(argv=None) -> int:
    """Extract the report text and print the classification of each line.

    Args:
        argv: Command-line arguments without the program name; defaults to
            ``sys.argv[1:]``. Only the first one (the PDF path) is used.

    Returns:
        0 on success, 2 when no PDF path was supplied.

    Raises:
        SystemExit: If the PDF does not contain exactly one page.
    """
    args = sys.argv[1:] if argv is None else list(argv)
    if not args:
        print("usage: {} REPORT.pdf".format(sys.argv[0]), file=sys.stderr)
        return 2

    # Imported here so classify_line() stays importable when the third-party
    # pdftotext package is not installed.
    import pdftotext

    # Load your PDF
    with open(args[0], "rb") as f:
        pdf = pdftotext.PDF(f)

    # An explicit check rather than assert: asserts vanish under `python -O`.
    if len(pdf) != 1:
        raise SystemExit("This report is more than 1 page!")

    for line in "".join(pdf).split("\n"):
        rendered = classify_line(line)
        if rendered is not None:
            print(rendered)
    return 0


if __name__ == "__main__":
    sys.exit(main())