You cannot select more than 25 topics.
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
28 lines
635 B
28 lines
635 B
import pdftotext
|
|
import sys
|
|
import re
|
|
|
|
# A line made up solely of digits, commas and whitespace is treated as a number.
numbers = re.compile(r"^[,\d\s]+$")
# An empty or whitespace-only line carries no information and is skipped.
ignore = re.compile(r"^\s*$")


def describe_line(line):
    """Return the text to print for one extracted line, or None to skip it.

    Args:
        line: One line of text, without its trailing newline.

    Returns:
        None for blank lines and for a lone form-feed character;
        the decimal digits of the parsed integer for numeric lines
        (commas removed); "BAD <repr>" for lines that look numeric but
        do not parse as a single integer; "TEXT <line>" for anything else.
    """
    if line == "\x0c" or ignore.match(line):
        # ignore blank lines and trailing junk
        return None
    if not numbers.match(line):
        return "TEXT " + line
    try:
        # it's a number; int() tolerates surrounding whitespace by itself
        return str(int(line.replace(",", "")))
    except ValueError:
        # e.g. "1 234" or "," match the pattern but are not one integer
        return "BAD " + repr(line)


def main(argv):
    """Print a classification of every line of a one-page PDF report.

    Args:
        argv: Command-line arguments; argv[1] is the path of the PDF.

    Raises:
        SystemExit: If no path was given or the PDF does not have
            exactly one page.
    """
    if len(argv) < 2:
        sys.exit("usage: python <script> REPORT.pdf")

    # Load your PDF
    with open(argv[1], "rb") as f:
        pdf = pdftotext.PDF(f)

    # Explicit check instead of `assert`, which is removed under `python -O`.
    if len(pdf) != 1:
        sys.exit("This report is more than 1 page!")

    for line in "".join(pdf).split("\n"):
        rendered = describe_line(line)
        if rendered is not None:
            print(rendered)


if __name__ == "__main__":
    main(sys.argv)
|
|
|