#!/bin/python3 # Programme assez bordélique pour parser des tickets de caisse Super U # Nécessite une base de donnée SQLite avec la structure suivante : ''' CREATE TABLE "achats" ( "id" INTEGER, "associated_id" INTEGER, "quantite" INTEGER NOT NULL, "article" TEXT NOT NULL, "categorie" TEXT, "prix" INTEGER NOT NULL, "date" TEXT NOT NULL, "heure" TEXT NOT NULL, "ville" TEXT NOT NULL, PRIMARY KEY("id" AUTOINCREMENT) ); tips : ''' import os import PyPDF2 import sqlite3 import argparse parser = argparse.ArgumentParser(description='This program parse Super U tickets and put datas in a SQLite database.', epilog='Enjoy the program! :)') parser.add_argument('-d', '--db', help='SQLite database file', required=True) parser.add_argument('-p', '--path', help='Path to PDF files', required=True) parser.add_argument('-v', '--verbose', help='Verbose mode', action='store_true') args = parser.parse_args() if args.verbose: print('Verbose mode activated') print(args) def parsePDF(f): with open(f, 'rb') as f: if args.verbose: print('Opening file : ' + f.name) reader = PyPDF2.PdfReader(f) contents = reader.pages[0].extract_text().split('\n') # Detecte si c'est un ticket de caisse ou ticket client if ("CARTE BANCAIRE" in contents[4]) or ("CARTE BANCAIRE" in contents[1]): if args.verbose: print('Pattern found : don\'t look like a good ticket. Next one...\n') return '' else: if args.verbose: print('No pattern found : seems to be a good one. Parsing it...') print(contents) x = 0 for i in contents: if args.verbose: print("[" + str(x) + "] " + contents[x]) x=x+1 if "===========" in contents[x]: if args.verbose: print("matching with END pattern") return contents[3:x] elif "-----------" in contents[x]: if args.verbose: print("matching with END pattern") return contents[3:x] else: pass def parseArticles(content, f): if len(content) == 0: return "" sql = ''' SELECT COUNT(*) FROM "main"."achats" WHERE "original_file" LIKE '%''' + f + '''%'; ''' cur = conn.cursor() cur.execute(sql) conn.commit() if cur.fetchone()[0] != 0: print("Ticket déjà présent dans la base de donnée. Ignoré.") return "" x = 0 for i in content: if args.verbose: print("[" + str(x) + "] " + content[x]) x=x+1 ville = content[2] info = content[12].split() date = content[12].split()[2] heure = content[12].split()[3] article = None z=-1 for line in content[14:]: z=z+1 if args.verbose: print(line) if ">>>>" in line: categorie = line.replace('>', '')[2:] elif "Pourcentage" in line: continue elif not "€" in line: article = line.split(" ")[0][1:] continue elif "€" in line: # Si article pas nul, c'est qu'on est sur un multi ligne if article != None: if args.verbose: print(line.split()) if (" x " in line) and ("€/kg" in line) and (" kg " in line): quantite = line.split()[0] prix = line.split()[3] + " €" if (" x " in line) and ("€" in line) and (not "€/kg" in line): quantite = line.split()[0] prix = line.split()[2] + " €" else: if args.verbose: print(line.split(" ")) # On à une ligne type poid avec ou sans prix... les relous quoi ! # Nom d'article, mais le reste est ligne du dessous # donc on sort de la boucle quand on à notre variable article = line.split(" ")[0][1:] quantite = 1 prix = line.split(" ")[len(line.split(" ")) - 2] if args.verbose: print("on sort") # On pousse la requête ! if (article != None) and (prix != None) and (quantite != None): if args.verbose == True: print("=> Date / Heure : " + date + " " + heure) print("=> Catégorie : " + categorie) print("=> Article : " + article) print("=> Quantité/poid : " + str(quantite)) print("=> Prix : " + prix) print("\n") sql = ''' INSERT INTO achats(quantite,article,categorie,prix,date,heure,ville,original_file) VALUES(?,?,?,?,?,?,?,?) ''' cur = conn.cursor() cur.execute(sql, (str(quantite), article, categorie, prix, date, heure, ville, f)) conn.commit() article = None prix = None quantite = None def create_connection(db_file): conn = None try: conn = sqlite3.connect(db_file) except Error as e: print(e) return conn def ajout_article(conn, project): sql = ''' INSERT INTO achats(name,begin_date,end_date) VALUES(?,?,?) ''' cur = conn.cursor() cur.execute(sql, project) conn.commit() return cur.lastrowid if __name__ == '__main__': if args.verbose: print('Database filename : ', args.db) print('PDF folder : ', args.path) # to store files in a list list = [] conn = create_connection(args.db) # dirs=directories for (root, dirs, file) in os.walk(args.path): for f in file: if '.pdf' in f: print('=> Play with file : ' + f) parseArticles(parsePDF(args.path+f), f) conn.close() print("terminé")