123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190 |
- #!/bin/python3
- # Programme assez bordélique pour parser des tickets de caisse Super U
- # Nécessite une base de donnée SQLite avec la structure suivante :
- '''
- CREATE TABLE "achats" (
- "id" INTEGER,
- "associated_id" INTEGER,
- "quantite" INTEGER NOT NULL,
- "article" TEXT NOT NULL,
- "categorie" TEXT,
- "prix" INTEGER NOT NULL,
- "date" TEXT NOT NULL,
- "heure" TEXT NOT NULL,
- "ville" TEXT NOT NULL,
- PRIMARY KEY("id" AUTOINCREMENT)
- );
- tips :
- '''
- import os
- import PyPDF2
- import sqlite3
- import argparse
- parser = argparse.ArgumentParser(description='This program parse Super U tickets and put datas in a SQLite database.',
- epilog='Enjoy the program! :)')
- parser.add_argument('-d', '--db', help='SQLite database file', required=True)
- parser.add_argument('-p', '--path', help='Path to PDF files', required=True)
- parser.add_argument('-v', '--verbose', help='Verbose mode', action='store_true')
- args = parser.parse_args()
- if args.verbose:
- print('Verbose mode activated')
- print(args)
- def parsePDF(f):
- with open(f, 'rb') as f:
- if args.verbose:
- print('Opening file : ' + f.name)
- reader = PyPDF2.PdfReader(f)
- contents = reader.pages[0].extract_text().split('\n')
- # Detecte si c'est un ticket de caisse ou ticket client
- if ("CARTE BANCAIRE" in contents[4]) or ("CARTE BANCAIRE" in contents[1]):
- if args.verbose:
- print('Pattern found : don\'t look like a good ticket. Next one...\n')
- return ''
- else:
- if args.verbose:
- print('No pattern found : seems to be a good one. Parsing it...')
- print(contents)
- x = 0
- for i in contents:
- if args.verbose:
- print("[" + str(x) + "] " + contents[x])
- x=x+1
- if "===========" in contents[x]:
- if args.verbose:
- print("matching with END pattern")
- return contents[3:x]
- elif "-----------" in contents[x]:
- if args.verbose:
- print("matching with END pattern")
- return contents[3:x]
- else:
- pass
- def parseArticles(content, f):
- if len(content) == 0:
- return ""
- sql = ''' SELECT COUNT(*) FROM "main"."achats" WHERE "original_file" LIKE '%''' + f + '''%'; '''
- cur = conn.cursor()
- cur.execute(sql)
- conn.commit()
- if cur.fetchone()[0] != 0:
- print("Ticket déjà présent dans la base de donnée. Ignoré.")
- return ""
- x = 0
- for i in content:
- if args.verbose:
- print("[" + str(x) + "] " + content[x])
- x=x+1
- ville = content[2]
- info = content[12].split()
- date = content[12].split()[2]
- heure = content[12].split()[3]
- article = None
- z=-1
- for line in content[14:]:
- z=z+1
- if args.verbose:
- print(line)
- if ">>>>" in line:
- categorie = line.replace('>', '')[2:]
- elif "Pourcentage" in line:
- continue
- elif not "€" in line:
- article = line.split(" ")[0][1:]
- continue
- elif "€" in line:
- # Si article pas nul, c'est qu'on est sur un multi ligne
- if article != None:
- if args.verbose:
- print(line.split())
- if (" x " in line) and ("€/kg" in line) and (" kg " in line):
- quantite = line.split()[0]
- prix = line.split()[3] + " €"
- if (" x " in line) and ("€" in line) and (not "€/kg" in line):
- quantite = line.split()[0]
- prix = line.split()[2] + " €"
- else:
- if args.verbose:
- print(line.split(" "))
- # On à une ligne type poid avec ou sans prix... les relous quoi !
- # Nom d'article, mais le reste est ligne du dessous
- # donc on sort de la boucle quand on à notre variable
- article = line.split(" ")[0][1:]
- quantite = 1
- prix = line.split(" ")[len(line.split(" ")) - 2]
- if args.verbose:
- print("on sort")
- # On pousse la requête !
- if (article != None) and (prix != None) and (quantite != None):
- if args.verbose == True:
- print("=> Date / Heure : " + date + " " + heure)
- print("=> Catégorie : " + categorie)
- print("=> Article : " + article)
- print("=> Quantité/poid : " + str(quantite))
- print("=> Prix : " + prix)
- print("\n")
- sql = ''' INSERT INTO achats(quantite,article,categorie,prix,date,heure,ville,original_file)
- VALUES(?,?,?,?,?,?,?,?) '''
- cur = conn.cursor()
- cur.execute(sql, (str(quantite), article, categorie, prix, date, heure, ville, f))
- conn.commit()
- article = None
- prix = None
- quantite = None
- def create_connection(db_file):
- conn = None
- try:
- conn = sqlite3.connect(db_file)
- except Error as e:
- print(e)
- return conn
- def ajout_article(conn, project):
- sql = ''' INSERT INTO achats(name,begin_date,end_date)
- VALUES(?,?,?) '''
- cur = conn.cursor()
- cur.execute(sql, project)
- conn.commit()
- return cur.lastrowid
- if __name__ == '__main__':
- if args.verbose:
- print('Database filename : ', args.db)
- print('PDF folder : ', args.path)
- # to store files in a list
- list = []
- conn = create_connection(args.db)
- # dirs=directories
- for (root, dirs, file) in os.walk(args.path):
- for f in file:
- if '.pdf' in f:
- print('=> Play with file : ' + f)
- parseArticles(parsePDF(args.path+f), f)
- conn.close()
- print("terminé")
|