Compare commits

..

5 Commits

Author SHA1 Message Date
D. Berge
195741a768 Merge branch '173-do-not-use-inodes-as-part-of-a-file-s-fingerprint' into 'devel'
Resolve "Do not use inodes as part of a file's fingerprint"

Closes #173

See merge request wgp/dougal/software!19
2022-02-07 16:08:04 +00:00
D. Berge
0ca44c3861 Add database upgrade file 10.
NOTE: this is the first time we modify the actual data
in the database, as opposed to adding to the schema.
2022-02-07 17:05:19 +01:00
D. Berge
53ed096e1b Modify file hashing function.
We remove the inode from the hash as it is unstable when the
files are on an SMB filesystem, and replace it with an MD5
of the absolute file path.
2022-02-07 17:03:10 +01:00
D. Berge
75f91a9553 Increment schema wanted version 2022-02-07 17:02:59 +01:00
D. Berge
40b07c9169 Merge branch '175-add-database-versioning-and-migration-mechanism' into 'devel'
Resolve "Add database versioning and migration mechanism"

Closes #175

See merge request wgp/dougal/software!18
2022-02-07 14:43:50 +00:00
3 changed files with 101 additions and 13 deletions

View File

@@ -4,6 +4,7 @@ import psycopg2
import configuration
import preplots
import p111
from hashlib import md5 # Because it's good enough
"""
Interface to the PostgreSQL database.
@@ -11,13 +12,16 @@ Interface to the PostgreSQL database.
def file_hash(file):
    """
    Calculate a file hash based on its name, size, modification and creation times.

    The hash is used to uniquely identify files in the database and detect if they
    have changed.  The inode is deliberately NOT part of the hash: it is unstable
    when files live on an SMB filesystem (issue #173), so the first 16 hex
    characters of the MD5 digest of the file path are used instead.

    :param file: path of the file to fingerprint; callers should pass an
        absolute path so the digest is stable across working directories.
    :return: a ":"-joined string of size, mtime, ctime and path digest.
    :raises OSError: if the file cannot be stat'ed.
    """
    h = md5()
    h.update(file.encode())
    # 16 hex characters (64 bits) of the digest are enough to disambiguate paths.
    name_digest = h.hexdigest()[:16]
    st = os.stat(file)
    return ":".join([str(v) for v in [st.st_size, st.st_mtime, st.st_ctime, name_digest]])
class Datastore:
"""
@@ -390,9 +394,9 @@ class Datastore:
with self.conn.cursor() as cursor:
cursor.execute("BEGIN;")
hash = self.add_file(filepath, cursor)
if not records or len(records) == 0:
print("File has no records (or none have been detected)")
# We add the file to the database anyway to signal that we have
@@ -412,13 +416,13 @@ class Datastore:
"""
cursor.execute(qry, (fileinfo["sequence"], fileinfo["line"], ntbp, incr, json.dumps(fileinfo["meta"])))
qry = """
UPDATE raw_lines
SET meta = meta || %s
WHERE sequence = %s;
"""
cursor.execute(qry, (json.dumps(fileinfo["meta"]), fileinfo["sequence"]))
qry = """
@@ -452,7 +456,7 @@ class Datastore:
with self.conn.cursor() as cursor:
cursor.execute("BEGIN;")
hash = self.add_file(filepath, cursor)
qry = """
@@ -462,13 +466,13 @@ class Datastore:
"""
cursor.execute(qry, (fileinfo["sequence"], fileinfo["line"], json.dumps(fileinfo["meta"])))
qry = """
UPDATE raw_lines
SET meta = meta || %s
WHERE sequence = %s;
"""
cursor.execute(qry, (json.dumps(fileinfo["meta"]), fileinfo["sequence"]))
qry = """
@@ -495,7 +499,7 @@ class Datastore:
if filedata is not None:
self.save_file_data(filepath, json.dumps(filedata), cursor)
cursor.execute("CALL final_line_post_import(%s);", (fileinfo["sequence"],))
self.maybe_commit()
@@ -662,7 +666,7 @@ class Datastore:
"""
Remove final data for a sequence.
"""
if cursor is None:
cur = self.conn.cursor()
else:
@@ -674,4 +678,4 @@ class Datastore:
self.maybe_commit()
# We do not commit if we've been passed a cursor, instead
# we assume that we are in the middle of a transaction

View File

@@ -0,0 +1,84 @@
-- Upgrade the database from commit 83be83e4 to 53ed096e.
--
-- New schema version: 0.2.0
--
-- ATTENTION:
--
-- ENSURE YOU HAVE BACKED UP THE DATABASE BEFORE RUNNING THIS SCRIPT.
--
--
-- NOTE: This upgrade affects all schemas in the database.
-- NOTE: Each application starts a transaction, which must be committed
-- or rolled back.
--
-- This migrates the file hashes to address issue #173.
-- The new hashes use size, modification time, creation time and the
-- first half of the MD5 hex digest of the file's absolute path.
--
-- This is a minor (rather than patch) version number increment because
-- the accompanying changes to `bin/datastore.py` mean that hashes already
-- stored in the database are incompatible with the new hashing function.
--
-- To apply, run as the dougal user:
--
-- psql <<EOF
-- \i $THIS_FILE
-- COMMIT;
-- EOF
--
-- NOTE: It can take a while if run on a large database.
-- NOTE: It can be applied multiple times without ill effect.
BEGIN;
-- Helper procedure so the top-level script can emit progress messages;
-- RAISE NOTICE is only available inside PL/pgSQL, not in plain SQL.
-- Dropped again at the end of the script.
CREATE OR REPLACE PROCEDURE show_notice (notice text) AS $$
BEGIN
RAISE NOTICE '%', notice;
END;
$$ LANGUAGE plpgsql;
-- Rewrite every row of <schema_name>.files: drop the last ':'-separated
-- component of `hash` (previously the inode) and append the first 16 hex
-- characters of md5(path) instead, matching the new file_hash() in
-- bin/datastore.py.
CREATE OR REPLACE PROCEDURE migrate_hashes (schema_name text) AS $$
BEGIN
RAISE NOTICE 'Migrating schema %', schema_name;
-- We need to set the search path because some of the trigger
-- functions reference other tables in survey schemas assuming
-- they are in the search path.
EXECUTE format('SET search_path TO %I,public', schema_name);
-- trim_array(..., 1) strips the final component; left(md5(path), 16)
-- re-appends the path digest.  Running this twice just strips the
-- digest added by the first run and appends the same value again,
-- which is why the script is safe to apply multiple times.
EXECUTE format('UPDATE %I.files SET hash = array_to_string(array_append(trim_array(string_to_array(hash, '':''), 1), left(md5(path), 16)), '':'')', schema_name);
EXECUTE 'SET search_path TO public'; -- Back to the default search path for good measure
END;
$$ LANGUAGE plpgsql;
-- Driver procedure: apply migrate_hashes() to every survey schema in the
-- database, in deterministic (alphabetical) order.
CREATE OR REPLACE PROCEDURE upgrade_10 () AS $$
DECLARE
row RECORD;
BEGIN
-- Survey schemas are identified by the 'survey_' name prefix.
FOR row IN
SELECT schema_name FROM information_schema.schemata
WHERE schema_name LIKE 'survey_%'
ORDER BY schema_name
LOOP
CALL migrate_hashes(row.schema_name);
END LOOP;
END;
$$ LANGUAGE plpgsql;
-- Run the migration, then drop the helper procedures so they do not
-- linger in the public schema afterwards.
CALL upgrade_10();
CALL show_notice('Cleaning up');
DROP PROCEDURE migrate_hashes (schema_name text);
DROP PROCEDURE upgrade_10 ();
CALL show_notice('Updating db_schema version');
-- Record the new schema version in public.info.  The JSON merge (||) on
-- conflict preserves any other keys already stored in the 'version' row.
INSERT INTO public.info VALUES ('version', '{"db_schema": "0.2.0"}')
ON CONFLICT (key) DO UPDATE
SET value = public.info.value || '{"db_schema": "0.2.0"}' WHERE public.info.key = 'version';
CALL show_notice('All done. You may now run "COMMIT;" to persist the changes');
DROP PROCEDURE show_notice (notice text);
--
--NOTE Run `COMMIT;` now if all went well
--

View File

@@ -11,7 +11,7 @@
"license": "UNLICENSED",
"private": true,
"config": {
"db_schema": "^0.1.0"
"db_schema": "^0.2.0"
},
"engines": {
"node": ">=14.0.0"