From 75f91a9553d189750ce1ec3ad0fb9f186dc11941 Mon Sep 17 00:00:00 2001 From: "D. Berge" Date: Mon, 7 Feb 2022 15:50:23 +0100 Subject: [PATCH 1/3] Increment schema wanted version --- lib/www/server/package.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/www/server/package.json b/lib/www/server/package.json index dd473c0..8922f75 100644 --- a/lib/www/server/package.json +++ b/lib/www/server/package.json @@ -11,7 +11,7 @@ "license": "UNLICENSED", "private": true, "config": { - "db_schema": "^0.1.0" + "db_schema": "^0.2.0" }, "engines": { "node": ">=14.0.0" From 53ed096e1bb2813de601a8e739419747f54f27b7 Mon Sep 17 00:00:00 2001 From: "D. Berge" Date: Mon, 7 Feb 2022 15:51:38 +0100 Subject: [PATCH 2/3] Modify file hashing function. We remove the inode from the hash as it is unstable when the files are on an SMB filesystem, and replace it with an MD5 of the absolute file path. --- bin/datastore.py | 28 ++++++++++++++++------------ 1 file changed, 16 insertions(+), 12 deletions(-) diff --git a/bin/datastore.py b/bin/datastore.py index 91d4d08..151f06a 100644 --- a/bin/datastore.py +++ b/bin/datastore.py @@ -4,6 +4,7 @@ import psycopg2 import configuration import preplots import p111 +from hashlib import md5 # Because it's good enough """ Interface to the PostgreSQL database. @@ -11,13 +12,16 @@ Interface to the PostgreSQL database. def file_hash(file): """ - Calculate a file hash based on its size, inode, modification and creation times. + Calculate a file hash based on its name, size, modification and creation times. The hash is used to uniquely identify files in the database and detect if they have changed. """ + h = md5() + h.update(file.encode()) + name_digest = h.hexdigest()[:16] st = os.stat(file) - return ":".join([str(v) for v in [st.st_size, st.st_mtime, st.st_ctime, st.st_ino]]) + return ":".join([str(v) for v in [st.st_size, st.st_mtime, st.st_ctime, name_digest]]) class Datastore: """ @@ -390,9 +394,9 @@ class Datastore: with self.conn.cursor() as cursor: cursor.execute("BEGIN;") - + hash = self.add_file(filepath, cursor) - + if not records or len(records) == 0: print("File has no records (or none have been detected)") # We add the file to the database anyway to signal that we have @@ -412,13 +416,13 @@ class Datastore: """ cursor.execute(qry, (fileinfo["sequence"], fileinfo["line"], ntbp, incr, json.dumps(fileinfo["meta"]))) - + qry = """ UPDATE raw_lines SET meta = meta || %s WHERE sequence = %s; """ - + cursor.execute(qry, (json.dumps(fileinfo["meta"]), fileinfo["sequence"])) qry = """ @@ -452,7 +456,7 @@ class Datastore: with self.conn.cursor() as cursor: cursor.execute("BEGIN;") - + hash = self.add_file(filepath, cursor) qry = """ @@ -462,13 +466,13 @@ class Datastore: """ cursor.execute(qry, (fileinfo["sequence"], fileinfo["line"], json.dumps(fileinfo["meta"]))) - + qry = """ UPDATE raw_lines SET meta = meta || %s WHERE sequence = %s; """ - + cursor.execute(qry, (json.dumps(fileinfo["meta"]), fileinfo["sequence"])) qry = """ @@ -495,7 +499,7 @@ class Datastore: if filedata is not None: self.save_file_data(filepath, json.dumps(filedata), cursor) - + cursor.execute("CALL final_line_post_import(%s);", (fileinfo["sequence"],)) self.maybe_commit() @@ -662,7 +666,7 @@ class Datastore: """ Remove final data for a sequence. """ - + if cursor is None: cur = self.conn.cursor() else: @@ -674,4 +678,4 @@ class Datastore: self.maybe_commit() # We do not commit if we've been passed a cursor, instead # we assume that we are in the middle of a transaction - + From 0ca44c3861638efcdf038d607ff28eaca627d877 Mon Sep 17 00:00:00 2001 From: "D. Berge" Date: Mon, 7 Feb 2022 15:54:55 +0100 Subject: [PATCH 3/3] Add database upgrade file 10. NOTE: this is the first time we modify the actual data in the database, as opposed to adding to the schema. --- .../upgrade10-83be83e4→53ed096e-v0.2.0.sql | 84 +++++++++++++++++++ 1 file changed, 84 insertions(+) create mode 100644 etc/db/upgrades/upgrade10-83be83e4→53ed096e-v0.2.0.sql diff --git a/etc/db/upgrades/upgrade10-83be83e4→53ed096e-v0.2.0.sql b/etc/db/upgrades/upgrade10-83be83e4→53ed096e-v0.2.0.sql new file mode 100644 index 0000000..c767759 --- /dev/null +++ b/etc/db/upgrades/upgrade10-83be83e4→53ed096e-v0.2.0.sql @@ -0,0 +1,84 @@ +-- Upgrade the database from commit 83be83e4 to 53ed096e. +-- +-- New schema version: 0.2.0 +-- +-- ATTENTION: +-- +-- ENSURE YOU HAVE BACKED UP THE DATABASE BEFORE RUNNING THIS SCRIPT. +-- +-- +-- NOTE: This upgrade affects all schemas in the database. +-- NOTE: Each application starts a transaction, which must be committed +-- or rolled back. +-- +-- This migrates the file hashes to address issue #173. +-- The new hashes use size, modification time, creation time and the +-- first half of the MD5 hex digest of the file's absolute path. +-- +-- It's a minor (rather than patch) version number increment because +-- changes to `bin/datastore.py` mean that the data is no longer +-- compatible with the hashing function. +-- +-- To apply, run as the dougal user: +-- +-- psql <