mirror of
https://gitlab.com/wgp/dougal/software.git
synced 2025-12-06 06:37:07 +00:00
Merge branch '173-do-not-use-inodes-as-part-of-a-file-s-fingerprint' into 'devel'
Resolve "Do not use inodes as part of a file's fingerprint" Closes #173 See merge request wgp/dougal/software!19
This commit is contained in:
@@ -4,6 +4,7 @@ import psycopg2
|
||||
import configuration
|
||||
import preplots
|
||||
import p111
|
||||
from hashlib import md5 # Because it's good enough
|
||||
|
||||
"""
|
||||
Interface to the PostgreSQL database.
|
||||
@@ -11,13 +12,16 @@ Interface to the PostgreSQL database.
|
||||
|
||||
def file_hash(file):
|
||||
"""
|
||||
Calculate a file hash based on its size, inode, modification and creation times.
|
||||
Calculate a file hash based on its name, size, modification and creation times.
|
||||
|
||||
The hash is used to uniquely identify files in the database and detect if they
|
||||
have changed.
|
||||
"""
|
||||
h = md5()
|
||||
h.update(file.encode())
|
||||
name_digest = h.hexdigest()[:16]
|
||||
st = os.stat(file)
|
||||
return ":".join([str(v) for v in [st.st_size, st.st_mtime, st.st_ctime, st.st_ino]])
|
||||
return ":".join([str(v) for v in [st.st_size, st.st_mtime, st.st_ctime, name_digest]])
|
||||
|
||||
class Datastore:
|
||||
"""
|
||||
@@ -390,9 +394,9 @@ class Datastore:
|
||||
|
||||
with self.conn.cursor() as cursor:
|
||||
cursor.execute("BEGIN;")
|
||||
|
||||
|
||||
hash = self.add_file(filepath, cursor)
|
||||
|
||||
|
||||
if not records or len(records) == 0:
|
||||
print("File has no records (or none have been detected)")
|
||||
# We add the file to the database anyway to signal that we have
|
||||
@@ -412,13 +416,13 @@ class Datastore:
|
||||
"""
|
||||
|
||||
cursor.execute(qry, (fileinfo["sequence"], fileinfo["line"], ntbp, incr, json.dumps(fileinfo["meta"])))
|
||||
|
||||
|
||||
qry = """
|
||||
UPDATE raw_lines
|
||||
SET meta = meta || %s
|
||||
WHERE sequence = %s;
|
||||
"""
|
||||
|
||||
|
||||
cursor.execute(qry, (json.dumps(fileinfo["meta"]), fileinfo["sequence"]))
|
||||
|
||||
qry = """
|
||||
@@ -452,7 +456,7 @@ class Datastore:
|
||||
|
||||
with self.conn.cursor() as cursor:
|
||||
cursor.execute("BEGIN;")
|
||||
|
||||
|
||||
hash = self.add_file(filepath, cursor)
|
||||
|
||||
qry = """
|
||||
@@ -462,13 +466,13 @@ class Datastore:
|
||||
"""
|
||||
|
||||
cursor.execute(qry, (fileinfo["sequence"], fileinfo["line"], json.dumps(fileinfo["meta"])))
|
||||
|
||||
|
||||
qry = """
|
||||
UPDATE raw_lines
|
||||
SET meta = meta || %s
|
||||
WHERE sequence = %s;
|
||||
"""
|
||||
|
||||
|
||||
cursor.execute(qry, (json.dumps(fileinfo["meta"]), fileinfo["sequence"]))
|
||||
|
||||
qry = """
|
||||
@@ -495,7 +499,7 @@ class Datastore:
|
||||
|
||||
if filedata is not None:
|
||||
self.save_file_data(filepath, json.dumps(filedata), cursor)
|
||||
|
||||
|
||||
cursor.execute("CALL final_line_post_import(%s);", (fileinfo["sequence"],))
|
||||
|
||||
self.maybe_commit()
|
||||
@@ -662,7 +666,7 @@ class Datastore:
|
||||
"""
|
||||
Remove final data for a sequence.
|
||||
"""
|
||||
|
||||
|
||||
if cursor is None:
|
||||
cur = self.conn.cursor()
|
||||
else:
|
||||
@@ -674,4 +678,4 @@ class Datastore:
|
||||
self.maybe_commit()
|
||||
# We do not commit if we've been passed a cursor, instead
|
||||
# we assume that we are in the middle of a transaction
|
||||
|
||||
|
||||
|
||||
84
etc/db/upgrades/upgrade10-83be83e4→53ed096e-v0.2.0.sql
Normal file
84
etc/db/upgrades/upgrade10-83be83e4→53ed096e-v0.2.0.sql
Normal file
@@ -0,0 +1,84 @@
|
||||
-- Upgrade the database from commit 83be83e4 to 53ed096e.
|
||||
--
|
||||
-- New schema version: 0.2.0
|
||||
--
|
||||
-- ATTENTION:
|
||||
--
|
||||
-- ENSURE YOU HAVE BACKED UP THE DATABASE BEFORE RUNNING THIS SCRIPT.
|
||||
--
|
||||
--
|
||||
-- NOTE: This upgrade affects all schemas in the database.
|
||||
-- NOTE: Each application starts a transaction, which must be committed
|
||||
-- or rolled back.
|
||||
--
|
||||
-- This migrates the file hashes to address issue #173.
|
||||
-- The new hashes use size, modification time, creation time and the
|
||||
-- first half of the MD5 hex digest of the file's absolute path.
|
||||
--
|
||||
-- It's a minor (rather than patch) version number increment because
|
||||
-- changes to `bin/datastore.py` mean that the data is no longer
|
||||
-- compatible with the hashing function.
|
||||
--
|
||||
-- To apply, run as the dougal user:
|
||||
--
|
||||
-- psql <<EOF
|
||||
-- \i $THIS_FILE
|
||||
-- COMMIT;
|
||||
-- EOF
|
||||
--
|
||||
-- NOTE: It can take a while if run on a large database.
|
||||
-- NOTE: It can be applied multiple times without ill effect.
|
||||
|
||||
|
||||
BEGIN;
|
||||
|
||||
CREATE OR REPLACE PROCEDURE show_notice (notice text) AS $$
|
||||
BEGIN
|
||||
RAISE NOTICE '%', notice;
|
||||
END;
|
||||
$$ LANGUAGE plpgsql;
|
||||
|
||||
CREATE OR REPLACE PROCEDURE migrate_hashes (schema_name text) AS $$
|
||||
BEGIN
|
||||
RAISE NOTICE 'Migrating schema %', schema_name;
|
||||
-- We need to set the search path because some of the trigger
|
||||
-- functions reference other tables in survey schemas assuming
|
||||
-- they are in the search path.
|
||||
EXECUTE format('SET search_path TO %I,public', schema_name);
|
||||
EXECUTE format('UPDATE %I.files SET hash = array_to_string(array_append(trim_array(string_to_array(hash, '':''), 1), left(md5(path), 16)), '':'')', schema_name);
|
||||
EXECUTE 'SET search_path TO public'; -- Back to the default search path for good measure
|
||||
END;
|
||||
$$ LANGUAGE plpgsql;
|
||||
|
||||
CREATE OR REPLACE PROCEDURE upgrade_10 () AS $$
|
||||
DECLARE
|
||||
row RECORD;
|
||||
BEGIN
|
||||
FOR row IN
|
||||
SELECT schema_name FROM information_schema.schemata
|
||||
WHERE schema_name LIKE 'survey_%'
|
||||
ORDER BY schema_name
|
||||
LOOP
|
||||
CALL migrate_hashes(row.schema_name);
|
||||
END LOOP;
|
||||
END;
|
||||
$$ LANGUAGE plpgsql;
|
||||
|
||||
CALL upgrade_10();
|
||||
|
||||
CALL show_notice('Cleaning up');
|
||||
DROP PROCEDURE migrate_hashes (schema_name text);
|
||||
DROP PROCEDURE upgrade_10 ();
|
||||
|
||||
CALL show_notice('Updating db_schema version');
|
||||
INSERT INTO public.info VALUES ('version', '{"db_schema": "0.2.0"}')
|
||||
ON CONFLICT (key) DO UPDATE
|
||||
SET value = public.info.value || '{"db_schema": "0.2.0"}' WHERE public.info.key = 'version';
|
||||
|
||||
|
||||
CALL show_notice('All done. You may now run "COMMIT;" to persist the changes');
|
||||
DROP PROCEDURE show_notice (notice text);
|
||||
|
||||
--
|
||||
--NOTE Run `COMMIT;` now if all went well
|
||||
--
|
||||
@@ -11,7 +11,7 @@
|
||||
"license": "UNLICENSED",
|
||||
"private": true,
|
||||
"config": {
|
||||
"db_schema": "^0.1.0"
|
||||
"db_schema": "^0.2.0"
|
||||
},
|
||||
"engines": {
|
||||
"node": ">=14.0.0"
|
||||
|
||||
Reference in New Issue
Block a user