diff --git a/bin/datastore.py b/bin/datastore.py index 91d4d08..151f06a 100644 --- a/bin/datastore.py +++ b/bin/datastore.py @@ -4,6 +4,7 @@ import psycopg2 import configuration import preplots import p111 +from hashlib import md5 # Because it's good enough """ Interface to the PostgreSQL database. @@ -11,13 +12,16 @@ Interface to the PostgreSQL database. def file_hash(file): """ - Calculate a file hash based on its size, inode, modification and creation times. + Calculate a file hash based on its name, size, modification and creation times. The hash is used to uniquely identify files in the database and detect if they have changed. """ + h = md5() + h.update(file.encode()) + name_digest = h.hexdigest()[:16] st = os.stat(file) - return ":".join([str(v) for v in [st.st_size, st.st_mtime, st.st_ctime, st.st_ino]]) + return ":".join([str(v) for v in [st.st_size, st.st_mtime, st.st_ctime, name_digest]]) class Datastore: """ @@ -390,9 +394,9 @@ class Datastore: with self.conn.cursor() as cursor: cursor.execute("BEGIN;") - + hash = self.add_file(filepath, cursor) - + if not records or len(records) == 0: print("File has no records (or none have been detected)") # We add the file to the database anyway to signal that we have @@ -412,13 +416,13 @@ class Datastore: """ cursor.execute(qry, (fileinfo["sequence"], fileinfo["line"], ntbp, incr, json.dumps(fileinfo["meta"]))) - + qry = """ UPDATE raw_lines SET meta = meta || %s WHERE sequence = %s; """ - + cursor.execute(qry, (json.dumps(fileinfo["meta"]), fileinfo["sequence"])) qry = """ @@ -452,7 +456,7 @@ class Datastore: with self.conn.cursor() as cursor: cursor.execute("BEGIN;") - + hash = self.add_file(filepath, cursor) qry = """ @@ -462,13 +466,13 @@ class Datastore: """ cursor.execute(qry, (fileinfo["sequence"], fileinfo["line"], json.dumps(fileinfo["meta"]))) - + qry = """ UPDATE raw_lines SET meta = meta || %s WHERE sequence = %s; """ - + cursor.execute(qry, (json.dumps(fileinfo["meta"]), fileinfo["sequence"])) qry = """ @@ -495,7 +499,7 @@ class Datastore: if filedata is not None: self.save_file_data(filepath, json.dumps(filedata), cursor) - + cursor.execute("CALL final_line_post_import(%s);", (fileinfo["sequence"],)) self.maybe_commit() @@ -662,7 +666,7 @@ class Datastore: """ Remove final data for a sequence. """ - + if cursor is None: cur = self.conn.cursor() else: @@ -674,4 +678,4 @@ class Datastore: self.maybe_commit() # We do not commit if we've been passed a cursor, instead # we assume that we are in the middle of a transaction - + diff --git a/etc/db/upgrades/upgrade10-83be83e4→53ed096e-v0.2.0.sql b/etc/db/upgrades/upgrade10-83be83e4→53ed096e-v0.2.0.sql new file mode 100644 index 0000000..c767759 --- /dev/null +++ b/etc/db/upgrades/upgrade10-83be83e4→53ed096e-v0.2.0.sql @@ -0,0 +1,84 @@ +-- Upgrade the database from commit 83be83e4 to 53ed096e. +-- +-- New schema version: 0.2.0 +-- +-- ATTENTION: +-- +-- ENSURE YOU HAVE BACKED UP THE DATABASE BEFORE RUNNING THIS SCRIPT. +-- +-- +-- NOTE: This upgrade affects all schemas in the database. +-- NOTE: Each application starts a transaction, which must be committed +-- or rolled back. +-- +-- This migrates the file hashes to address issue #173. +-- The new hashes use size, modification time, creation time and the +-- first half of the MD5 hex digest of the file's absolute path. +-- +-- It's a minor (rather than patch) version number increment because +-- changes to `bin/datastore.py` mean that the data is no longer +-- compatible with the hashing function. +-- +-- To apply, run as the dougal user: +-- +-- psql <=14.0.0"