#!/bin/python3 # Programme assez bordélique pour parser des tickets de caisse Super U # Nécessite une base de donnée SQLite avec la structure suivante : ''' CREATE TABLE "achats" ( "id" INTEGER, "associated_id" INTEGER, "quantite" INTEGER NOT NULL, "article" TEXT NOT NULL, "categorie" TEXT, "prix" INTEGER NOT NULL, "date" TEXT NOT NULL, "heure" TEXT NOT NULL, "ville" TEXT NOT NULL, PRIMARY KEY("id" AUTOINCREMENT) ); ''' import sys import os import PyPDF2 import sqlite3 def parsePDF(f): with open(f, 'rb') as f: reader = PyPDF2.PdfReader(f) contents = reader.getPage(0).extractText().split('\n') # Detecte si c'est un ticket de caisse ou ticket client if ("CARTE BANCAIRE" in contents[4]) or ("CARTE BANCAIRE" in contents[1]): print('Pattern found : don\'t look like a good ticket. Next one...\n') return '' else: print('No pattern found : seems to be a good one. Parsing it...') #print(contents) x = 0 for i in contents: print("[" + str(x) + "] " + contents[x]) x=x+1 if "===========" in contents[x]: print("matching with END pattern") return contents[3:x] elif "-----------" in contents[x]: print("matching with END pattern") return contents[3:x] else: pass def parseArticles(content): if len(content) == 0: return "" x = 0 for i in content: #print("[" + str(x) + "] " + content[x]) x=x+1 ville = content[2] info = content[12].split() date = content[12].split()[2] heure = content[12].split()[3] article = None z=-1 for line in content[14:]: z=z+1 print(line) if ">>>>" in line: categorie = line.replace('>', '')[2:] elif "Pourcentage" in line: continue elif not "€" in line: article = line.split(" ")[0][1:] continue elif "€" in line: # # Si article pas nul, c'est qu'on est sur un multi ligne # if article != None: print(line.split()) if (" x " in line) and ("€/kg" in line) and (" kg " in line): quantite = line.split()[0] prix = line.split()[3] + " €" if (" x " in line) and ("€" in line) and (not "€/kg" in line): quantite = line.split()[0] prix = line.split()[2] + " €" else: print("AAAAAAAAAA") print(line.split(" ")) # On à une ligne type poid avec ou sans prix... les relous quoi ! # Nom d'article, mais le reste est ligne du dessous # donc on sort de la boucle quand on à notre variable article = line.split(" ")[0][1:] quantite = 1 prix = line.split(" ")[len(line.split(" ")) - 2] print("on sort") # On pousse la requête ! if (article != None) and (prix != None) and (quantite != None): print("=> Date / Heure : " + date + " " + heure) print("=> Catégorie : " + categorie) print("=> Article : " + article) print("=> Quantité/poid : " + str(quantite)) print("=> Prix : " + prix) print("\n") sql = ''' INSERT INTO achats(quantite,article,categorie,prix,date,heure,ville) VALUES(?,?,?,?,?,?,?) ''' cur = conn.cursor() cur.execute(sql, (str(quantite), article, categorie, prix, date, heure, ville )) conn.commit() article = None prix = None quantite = None # Comme on est des cochons et que l'algo est trop mauvais, on peut avoir des erreurs de parsing # on va donc clean les entrées qui ont des valeurs nulles. # elif (article != None) and (prix == "") and (quantite != None): # continue def create_connection(db_file): conn = None try: conn = sqlite3.connect(db_file) except Error as e: print(e) return conn def ajout_article(conn, project): sql = ''' INSERT INTO achats(name,begin_date,end_date) VALUES(?,?,?) ''' cur = conn.cursor() cur.execute(sql, project) conn.commit() return cur.lastrowid if __name__ == '__main__': if len(sys.argv) != 2: print('Erreur') exit(1) print(f'Script name is {sys.argv[0]}') print(f'Path with PDF files is {sys.argv[1]}') # to store files in a list list = [] conn = create_connection("database") # dirs=directories for (root, dirs, file) in os.walk(sys.argv[1]): for f in file: if '.pdf' in f: print('=> Play with file : ' + f) parseArticles(parsePDF(sys.argv[1]+"/"+f)) conn.close() print("terminé")