-
Notifications
You must be signed in to change notification settings - Fork 7
/
Copy pathprerechnung.py
302 lines (269 loc) · 10.9 KB
/
prerechnung.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
import os
import PySimpleGUI as sg
import utils
import project
import doc
import settings
import purchase_invoice
from purchase_invoice_google_parser import get_element_with_high_confidence
from api import Api, LIMIT
from itertools import groupby
import json
import traceback
import args
import company
import tempfile
from google.cloud import documentai_v1beta3 as documentai
from google.api_core.client_options import ClientOptions
# Description of the invoice entity layout, written in JSON Schema style:
# a flat set of string-valued invoice fields plus a non-empty "items" array
# whose elements are objects of string-valued line-item fields.
ENTITIES_DATA_SCHEMA = {
    "title": "Entities format",
    "required": ["total_amount"],
    "type": "object",
    "properties": {
        # Invoice-level fields; each one gets its own {"type": "string"} dict.
        **{
            field: {"type": "string"}
            for field in (
                "supplier",
                "supplier_address",
                "bill_no",
                "order_id",
                "due_date",
                "posting_date",
                "ship_to_address",
                "net_amount",
                "total_tax_amount",
                "total_amount",
            )
        },
        # Line items: at least one, each needing a description and an amount.
        "items": {
            "type": "array",
            "items": {
                "required": ["item-description", "item-amount"],
                "type": "object",
                "properties": {
                    column: {"type": "string"}
                    for column in (
                        "item-pos",
                        "item-code",
                        "item-description",
                        "item-quantity",
                        "item-unit-price",
                        "item-amount",
                    )
                },
            },
            "minItems": 1,
        },
    },
}
def process(company_name):
    """
    Preprocess every PreRechnung of a company that is not yet marked as processed.

    Args:
        company_name: value the 'company' field of the PreRechnung must match
    """
    # Only the document name and the attached PDF are requested here.
    unprocessed_filter = {'company': company_name, 'processed': False}
    pending = Api.api.get_list(
        "PreRechnung",
        filters=unprocessed_filter,
        fields=['name', 'pdf'],
        limit_page_length=LIMIT,
    )
    for pre_invoice in pending:
        process_inv(pre_invoice)
    print("Prerechnungen vorprozessiert")
def extract_invoice_info(pdf_file_content) -> dict:
    """
    Run a PDF through the Google Document AI processor and collect its entities.

    The Google project id and the processor id are read from the PySimpleGUI
    user settings ('-google-credentials-' and '-invoice-processor-'). As a side
    effect, the environment variable GOOGLE_APPLICATION_CREDENTIALS is set to
    'google-credentials.json'. The request is sent to the EU endpoint.

    Entities (and their properties) with a confidence below 0.2 are dropped,
    except that entities of type "item" and entities without a confidence are
    always kept. Per entity, only the most confident property of each property
    type is retained. Consecutive "item" entities are merged into one line
    item until a property type would repeat; merged line items with fewer than
    two properties are discarded.

    Args:
        pdf_file_content: content of the PDF file to be analysed
    Returns:
        dict with the keys "document_text" (full recognised text) and
        "entities" (list of dicts with value, type, properties, confidence,
        page_number and line_number)
    """
    min_confidence = 0.2

    user_settings = sg.UserSettings()
    project_id = user_settings['-google-credentials-']['project_id']
    processor_id = user_settings['-invoice-processor-']
    location = "eu"
    mime_type = "application/pdf"

    # The client library locates its credentials through this variable
    os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = 'google-credentials.json'
    # Set endpoint to EU
    options = ClientOptions(api_endpoint="eu-documentai.googleapis.com")
    client = documentai.DocumentProcessorServiceClient(client_options=options)

    # The full resource name of the processor, e.g.:
    # projects/project-id/locations/location/processors/processor-id
    name = f"projects/{project_id}/locations/{location}/processors/{processor_id}"
    request = {
        "name": name,
        "document": {"content": pdf_file_content, "mime_type": mime_type},
    }

    # Use the Document AI API to process the document
    result = client.process_document(request=request)
    print("Document processing complete.")
    document = result.document
    # All of the recognised document text as one big string
    text = document.text

    # Step 1: turn the API entities into plain dicts
    sub_entities = []
    for entity in document.entities:
        props = [
            {
                "value": prop.normalized_value.text or prop.text_anchor.content or prop.mention_text,
                "type": prop.type_,
                "confidence": prop.confidence,
            }
            for prop in entity.properties
            if prop.confidence >= min_confidence
        ]
        # groupby needs its input sorted by the grouping key (the type)
        sorted_data = sorted(props, key=lambda x: (x['type'], -x['confidence']))
        # Keep the property with the highest confidence for each type
        highest_confidence_props = [
            max(group_values, key=lambda x: x['confidence'])
            for _, group_values in groupby(sorted_data, key=lambda x: x['type'])
        ]
        if entity.type_ == "item" or not entity.confidence or entity.confidence >= min_confidence:
            try:
                # Despite the key name "line_number", this is the start index
                # of the entity's first text segment; it is only used for ordering.
                line_no = entity.text_anchor.text_segments[0].start_index
            except (IndexError, AttributeError):
                # No text segment available: order the entity first on its page
                line_no = 0
            sub_entities.append({
                "value": entity.normalized_value.text or entity.text_anchor.content or entity.mention_text,
                "type": entity.type_,
                "properties": highest_confidence_props,
                "confidence": entity.confidence,
                "page_number": entity.page_anchor.page_refs[0].page,
                "line_number": line_no,
            })
    sub_entities = sorted(sub_entities, key=lambda x: (x['page_number'], x['line_number']))

    # Step 2: merge consecutive item fragments into complete line items
    keys = []              # property types already collected for the pending item
    entities = []
    base_entity_info = {}  # pending (merged) item; empty dict means "none"
    for sub_entity in sub_entities:
        if sub_entity['type'] == 'item':
            prop_types = [d['type'] for d in sub_entity['properties']]
            if not prop_types:
                continue
            if set(keys) & set(prop_types):
                # A property type repeats, so a new line item starts here;
                # flush the pending one if it carries more than one property.
                if len(base_entity_info['properties']) > 1:
                    entities.append(base_entity_info)
                keys = []
                base_entity_info = {}
            keys.extend(prop_types)
            if not base_entity_info:
                base_entity_info = sub_entity
            else:
                base_entity_info['properties'].extend(sub_entity['properties'])
        else:
            # A non-item entity ends any pending line item
            if base_entity_info:
                if len(base_entity_info['properties']) > 1:
                    entities.append(base_entity_info)
                keys = []
                base_entity_info = {}
            entities.append(sub_entity)
    if base_entity_info and len(base_entity_info['properties']) > 1:
        entities.append(base_entity_info)

    # Return the results as a dictionary
    return {"document_text": text, "entities": entities}
def process_inv(pr):
    """
    Preprocesses the pre invoice and updates the database with the extracted information.

    If Google credentials are configured in the PySimpleGUI user settings, the
    PDF is analysed with extract_invoice_info and the result is stored as JSON
    in pr['json']; errors in this branch are printed and the pre invoice is
    then not updated. Otherwise the local invoice parser is used; parser
    errors are printed and the pre invoice is still marked as processed.

    In both branches pr is modified in place ('betrag', 'auftragsnr',
    'processed', 'doctype') and written back with Api.api.update.

    Args:
        pr: The pre invoice to be preprocessed (dict with at least 'name' and 'pdf')
    """
    print(pr['name'])
    contents = Api.api.get_file(pr['pdf'])
    if sg.UserSettings().get('-google-credentials-'):
        # use Google invoice AI
        try:
            invoice_info = extract_invoice_info(contents)
            pr['json'] = json.dumps(invoice_info)
            pr['auftragsnr'] = get_element_with_high_confidence(invoice_info, 'order_id')
            pr['betrag'] = get_element_with_high_confidence(invoice_info, 'total_amount')
            pr['processed'] = True
            pr['doctype'] = 'PreRechnung'
            Api.api.update(pr)
        except Exception as e:
            print(str(e)+"\n"+traceback.format_exc())
    else:
        # use local invoice parser
        # .get() is used because callers may pass a pre invoice that was
        # fetched with only a subset of its fields.
        inv = purchase_invoice.PurchaseInvoice(pr.get('lager'))
        fd, tmpfilename = tempfile.mkstemp(suffix=".pdf")
        try:
            # Wrap the descriptor returned by mkstemp so that it gets closed
            with os.fdopen(fd, "wb") as f:
                f.write(contents)
            try:
                inv.parse_invoice(None, tmpfilename,
                                  account=pr.get('buchungskonto'),
                                  paid_by_submitter=pr.get('selbst_bezahlt'),
                                  given_supplier=pr.get('lieferant'),
                                  is_test=True)
            except Exception as e:
                # Best effort: whatever was parsed so far is still used below
                print(e)
        finally:
            # mkstemp leaves deletion of the file to the caller
            try:
                os.remove(tmpfilename)
            except OSError:
                pass
        try:
            # NOTE(review): int() drops any fractional part of the VAT
            # amounts - confirm that this is intended.
            vat = sum(map(int, inv.vat.values()))
        except Exception:
            vat = 0
        if not inv.gross_total:
            inv.gross_total = inv.total + vat
        print("{} {} {}".format(pr['name'], inv.gross_total, inv.order_id))
        if inv.gross_total:
            pr['betrag'] = inv.gross_total
        if inv.order_id:
            pr['auftragsnr'] = inv.order_id
        pr['processed'] = True
        pr['doctype'] = 'PreRechnung'
        Api.api.update(pr)
def to_pay(company_name):
    """
    List the pre invoices of a company that still have to be paid.

    Fetches all PreRechnung documents of the company that are not yet
    transferred from the account and have a payment date, sorts them by that
    date and adds the running total of the amounts to each entry.

    Args:
        company_name: value the 'company' field of the PreRechnung must match
    Returns:
        the list of pre invoices (dicts), sorted by 'zu_zahlen_am', each with
        an additional key 'summe' holding the cumulative sum of 'betrag' up
        to and including that entry
    """
    prs = Api.api.get_list("PreRechnung", filters={'company': company_name,
                                                   'vom_konto_überwiesen': False,
                                                   'zu_zahlen_am': ['>', '01-01-1980']},
                           fields=['zu_zahlen_am', 'name', 'lieferant',
                                   'auftragsnr', 'betrag', 'typ',
                                   'datum', 'kommentar'],
                           limit_page_length=LIMIT)
    prs.sort(key=lambda pr: pr['zu_zahlen_am'])
    running_total = 0.0
    for pr in prs:
        # An empty amount counts as zero instead of breaking the sum
        running_total += pr.get('betrag') or 0.0
        pr['summe'] = running_total
    return prs
def read_and_transfer(inv, check_dup=True):
    """
    Process a pre invoice and create an ERPNext invoice for it.

    The pre invoice is preprocessed first if it is not yet marked as
    processed. Its PDF is stored in a temporary file, shown with
    utils.evince and handed to PurchaseInvoice.read_and_transfer. If a
    purchase invoice results and the pre invoice is not yet linked to one,
    the pre invoice is marked as entered ('eingepflegt'), linked to the new
    purchase invoice and updated. The temporary file is removed afterwards,
    also when an error occurs.

    Args:
        inv: the pre invoice
        check_dup: check for duplicates, and if found, do not create a new purchase invoice
    Returns:
        the result of PurchaseInvoice.read_and_transfer
    """
    if not inv['processed']:
        process_inv(inv)
    print("Lese ein {} {}:".format(inv['name'], inv['pdf']))
    json_str = inv.get('json')
    json_object = json.loads(json_str) if json_str else None
    pdf = Api.api.get_file(inv['pdf'])
    f = utils.store_temp_file(pdf, ".pdf")
    try:
        utils.evince(f)
        # Stock is only updated for pre invoices that belong to a project of
        # a stock project type and are booked on a stock account.
        update_stock = 'chance' in inv and inv['chance'] and \
            project.project_type(inv['chance']) in settings.STOCK_PROJECT_TYPES and \
            inv.get('buchungskonto') in settings.STOCK_PRE_ACCOUNTS
        pinv = purchase_invoice.PurchaseInvoice.read_and_transfer(
            json_object, f, update_stock,
            account_abbrv=inv.get('buchungskonto'), paid_by_submitter=inv.get('selbst_bezahlt', False),
            project=inv.get('chance'), supplier=inv.get('lieferant'), check_dup=check_dup
        )
        if pinv and not inv.get('purchase_invoice'):
            inv['eingepflegt'] = True
            inv['purchase_invoice'] = pinv.doc['name']
            inv_doc = doc.Doc(doc=inv, doctype='PreRechnung')
            inv_doc.update()
    finally:
        # Remove the temporary PDF even if the transfer failed
        if f:
            try:
                os.remove(f)
            except OSError:
                pass
    return pinv
def read_and_transfer_pdf(file, update_stock=True, account=None,
                          paid_by_submitter=False, project=None,
                          supplier=None, check_dup=True):
    """
    Create an ERPNext invoice straight from a PDF file, bypassing pre invoices.

    Meant to be run standalone from the command line: it initialises the
    arguments and the companies itself, analyses the PDF with
    extract_invoice_info and passes the result to
    PurchaseInvoice.read_and_transfer.

    Args:
        file: name of the PDF file
        update_stock: whether stock is to be updated
        account: abbreviated default account
        paid_by_submitter: whether the submitter paid the invoice
        project: the project
        supplier: the supplier
        check_dup: look for duplicates and create no new purchase invoice if one is found
    Returns:
        the result of PurchaseInvoice.read_and_transfer
    """
    args.init()
    company.Company.init_companies()
    with open(file, "rb") as pdf_handle:
        pdf_bytes = pdf_handle.read()
    invoice_info = extract_invoice_info(pdf_bytes)
    return purchase_invoice.PurchaseInvoice.read_and_transfer(
        invoice_info,
        file,
        update_stock,
        account_abbrv=account,
        paid_by_submitter=paid_by_submitter,
        project=project,
        supplier=supplier,
        check_dup=check_dup,
    )