main.py 5.9 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190
  1. #!/bin/python3
  2. # Programme assez bordélique pour parser des tickets de caisse Super U
  3. # Nécessite une base de donnée SQLite avec la structure suivante :
  4. '''
  5. CREATE TABLE "achats" (
  6. "id" INTEGER,
  7. "associated_id" INTEGER,
  8. "quantite" INTEGER NOT NULL,
  9. "article" TEXT NOT NULL,
  10. "categorie" TEXT,
  11. "prix" INTEGER NOT NULL,
  12. "date" TEXT NOT NULL,
  13. "heure" TEXT NOT NULL,
  14. "ville" TEXT NOT NULL,
  15. PRIMARY KEY("id" AUTOINCREMENT)
  16. );
  17. tips :
  18. '''
  19. import os
  20. import PyPDF2
  21. import sqlite3
  22. import argparse
  23. parser = argparse.ArgumentParser(description='This program parse Super U tickets and put datas in a SQLite database.',
  24. epilog='Enjoy the program! :)')
  25. parser.add_argument('-d', '--db', help='SQLite database file', required=True)
  26. parser.add_argument('-p', '--path', help='Path to PDF files', required=True)
  27. parser.add_argument('-v', '--verbose', help='Verbose mode', action='store_true')
  28. args = parser.parse_args()
  29. if args.verbose:
  30. print('Verbose mode activated')
  31. print(args)
  32. def parsePDF(f):
  33. with open(f, 'rb') as f:
  34. if args.verbose:
  35. print('Opening file : ' + f.name)
  36. reader = PyPDF2.PdfReader(f)
  37. contents = reader.pages[0].extract_text().split('\n')
  38. # Detecte si c'est un ticket de caisse ou ticket client
  39. if ("CARTE BANCAIRE" in contents[4]) or ("CARTE BANCAIRE" in contents[1]):
  40. if args.verbose:
  41. print('Pattern found : don\'t look like a good ticket. Next one...\n')
  42. return ''
  43. else:
  44. if args.verbose:
  45. print('No pattern found : seems to be a good one. Parsing it...')
  46. print(contents)
  47. x = 0
  48. for i in contents:
  49. if args.verbose:
  50. print("[" + str(x) + "] " + contents[x])
  51. x=x+1
  52. if "===========" in contents[x]:
  53. if args.verbose:
  54. print("matching with END pattern")
  55. return contents[3:x]
  56. elif "-----------" in contents[x]:
  57. if args.verbose:
  58. print("matching with END pattern")
  59. return contents[3:x]
  60. else:
  61. pass
  62. def parseArticles(content, f):
  63. if len(content) == 0:
  64. return ""
  65. sql = ''' SELECT COUNT(*) FROM "main"."achats" WHERE "original_file" LIKE '%''' + f + '''%'; '''
  66. cur = conn.cursor()
  67. cur.execute(sql)
  68. conn.commit()
  69. if cur.fetchone()[0] != 0:
  70. print("Ticket déjà présent dans la base de donnée. Ignoré.")
  71. return ""
  72. x = 0
  73. for i in content:
  74. if args.verbose:
  75. print("[" + str(x) + "] " + content[x])
  76. x=x+1
  77. ville = content[2]
  78. info = content[12].split()
  79. date = content[12].split()[2]
  80. heure = content[12].split()[3]
  81. article = None
  82. z=-1
  83. for line in content[14:]:
  84. z=z+1
  85. if args.verbose:
  86. print(line)
  87. if ">>>>" in line:
  88. categorie = line.replace('>', '')[2:]
  89. elif "Pourcentage" in line:
  90. continue
  91. elif not "€" in line:
  92. article = line.split(" ")[0][1:]
  93. continue
  94. elif "€" in line:
  95. # Si article pas nul, c'est qu'on est sur un multi ligne
  96. if article != None:
  97. if args.verbose:
  98. print(line.split())
  99. if (" x " in line) and ("€/kg" in line) and (" kg " in line):
  100. quantite = line.split()[0]
  101. prix = line.split()[3] + " €"
  102. if (" x " in line) and ("€" in line) and (not "€/kg" in line):
  103. quantite = line.split()[0]
  104. prix = line.split()[2] + " €"
  105. else:
  106. if args.verbose:
  107. print(line.split(" "))
  108. # On à une ligne type poid avec ou sans prix... les relous quoi !
  109. # Nom d'article, mais le reste est ligne du dessous
  110. # donc on sort de la boucle quand on à notre variable
  111. article = line.split(" ")[0][1:]
  112. quantite = 1
  113. prix = line.split(" ")[len(line.split(" ")) - 2]
  114. if args.verbose:
  115. print("on sort")
  116. # On pousse la requête !
  117. if (article != None) and (prix != None) and (quantite != None):
  118. if args.verbose == True:
  119. print("=> Date / Heure : " + date + " " + heure)
  120. print("=> Catégorie : " + categorie)
  121. print("=> Article : " + article)
  122. print("=> Quantité/poid : " + str(quantite))
  123. print("=> Prix : " + prix)
  124. print("\n")
  125. sql = ''' INSERT INTO achats(quantite,article,categorie,prix,date,heure,ville,original_file)
  126. VALUES(?,?,?,?,?,?,?,?) '''
  127. cur = conn.cursor()
  128. cur.execute(sql, (str(quantite), article, categorie, prix, date, heure, ville, f))
  129. conn.commit()
  130. article = None
  131. prix = None
  132. quantite = None
  133. def create_connection(db_file):
  134. conn = None
  135. try:
  136. conn = sqlite3.connect(db_file)
  137. except Error as e:
  138. print(e)
  139. return conn
  140. def ajout_article(conn, project):
  141. sql = ''' INSERT INTO achats(name,begin_date,end_date)
  142. VALUES(?,?,?) '''
  143. cur = conn.cursor()
  144. cur.execute(sql, project)
  145. conn.commit()
  146. return cur.lastrowid
  147. if __name__ == '__main__':
  148. if args.verbose:
  149. print('Database filename : ', args.db)
  150. print('PDF folder : ', args.path)
  151. # to store files in a list
  152. list = []
  153. conn = create_connection(args.db)
  154. # dirs=directories
  155. for (root, dirs, file) in os.walk(args.path):
  156. for f in file:
  157. if '.pdf' in f:
  158. print('=> Play with file : ' + f)
  159. parseArticles(parsePDF(args.path+f), f)
  160. conn.close()
  161. print("terminé")