mirror of
https://gitlab.com/wgp/dougal/software.git
synced 2025-12-06 07:47:07 +00:00
Merge branch '173-do-not-use-inodes-as-part-of-a-file-s-fingerprint' into 'devel'
Resolve "Do not use inodes as part of a file's fingerprint" Closes #173 See merge request wgp/dougal/software!19
This commit is contained in:
@@ -4,6 +4,7 @@ import psycopg2
|
|||||||
import configuration
|
import configuration
|
||||||
import preplots
|
import preplots
|
||||||
import p111
|
import p111
|
||||||
|
from hashlib import md5 # Because it's good enough
|
||||||
|
|
||||||
"""
|
"""
|
||||||
Interface to the PostgreSQL database.
|
Interface to the PostgreSQL database.
|
||||||
@@ -11,13 +12,16 @@ Interface to the PostgreSQL database.
|
|||||||
|
|
||||||
def file_hash(file):
|
def file_hash(file):
|
||||||
"""
|
"""
|
||||||
Calculate a file hash based on its size, inode, modification and creation times.
|
Calculate a file hash based on its name, size, modification and creation times.
|
||||||
|
|
||||||
The hash is used to uniquely identify files in the database and detect if they
|
The hash is used to uniquely identify files in the database and detect if they
|
||||||
have changed.
|
have changed.
|
||||||
"""
|
"""
|
||||||
|
h = md5()
|
||||||
|
h.update(file.encode())
|
||||||
|
name_digest = h.hexdigest()[:16]
|
||||||
st = os.stat(file)
|
st = os.stat(file)
|
||||||
return ":".join([str(v) for v in [st.st_size, st.st_mtime, st.st_ctime, st.st_ino]])
|
return ":".join([str(v) for v in [st.st_size, st.st_mtime, st.st_ctime, name_digest]])
|
||||||
|
|
||||||
class Datastore:
|
class Datastore:
|
||||||
"""
|
"""
|
||||||
@@ -390,9 +394,9 @@ class Datastore:
|
|||||||
|
|
||||||
with self.conn.cursor() as cursor:
|
with self.conn.cursor() as cursor:
|
||||||
cursor.execute("BEGIN;")
|
cursor.execute("BEGIN;")
|
||||||
|
|
||||||
hash = self.add_file(filepath, cursor)
|
hash = self.add_file(filepath, cursor)
|
||||||
|
|
||||||
if not records or len(records) == 0:
|
if not records or len(records) == 0:
|
||||||
print("File has no records (or none have been detected)")
|
print("File has no records (or none have been detected)")
|
||||||
# We add the file to the database anyway to signal that we have
|
# We add the file to the database anyway to signal that we have
|
||||||
@@ -412,13 +416,13 @@ class Datastore:
|
|||||||
"""
|
"""
|
||||||
|
|
||||||
cursor.execute(qry, (fileinfo["sequence"], fileinfo["line"], ntbp, incr, json.dumps(fileinfo["meta"])))
|
cursor.execute(qry, (fileinfo["sequence"], fileinfo["line"], ntbp, incr, json.dumps(fileinfo["meta"])))
|
||||||
|
|
||||||
qry = """
|
qry = """
|
||||||
UPDATE raw_lines
|
UPDATE raw_lines
|
||||||
SET meta = meta || %s
|
SET meta = meta || %s
|
||||||
WHERE sequence = %s;
|
WHERE sequence = %s;
|
||||||
"""
|
"""
|
||||||
|
|
||||||
cursor.execute(qry, (json.dumps(fileinfo["meta"]), fileinfo["sequence"]))
|
cursor.execute(qry, (json.dumps(fileinfo["meta"]), fileinfo["sequence"]))
|
||||||
|
|
||||||
qry = """
|
qry = """
|
||||||
@@ -452,7 +456,7 @@ class Datastore:
|
|||||||
|
|
||||||
with self.conn.cursor() as cursor:
|
with self.conn.cursor() as cursor:
|
||||||
cursor.execute("BEGIN;")
|
cursor.execute("BEGIN;")
|
||||||
|
|
||||||
hash = self.add_file(filepath, cursor)
|
hash = self.add_file(filepath, cursor)
|
||||||
|
|
||||||
qry = """
|
qry = """
|
||||||
@@ -462,13 +466,13 @@ class Datastore:
|
|||||||
"""
|
"""
|
||||||
|
|
||||||
cursor.execute(qry, (fileinfo["sequence"], fileinfo["line"], json.dumps(fileinfo["meta"])))
|
cursor.execute(qry, (fileinfo["sequence"], fileinfo["line"], json.dumps(fileinfo["meta"])))
|
||||||
|
|
||||||
qry = """
|
qry = """
|
||||||
UPDATE raw_lines
|
UPDATE raw_lines
|
||||||
SET meta = meta || %s
|
SET meta = meta || %s
|
||||||
WHERE sequence = %s;
|
WHERE sequence = %s;
|
||||||
"""
|
"""
|
||||||
|
|
||||||
cursor.execute(qry, (json.dumps(fileinfo["meta"]), fileinfo["sequence"]))
|
cursor.execute(qry, (json.dumps(fileinfo["meta"]), fileinfo["sequence"]))
|
||||||
|
|
||||||
qry = """
|
qry = """
|
||||||
@@ -495,7 +499,7 @@ class Datastore:
|
|||||||
|
|
||||||
if filedata is not None:
|
if filedata is not None:
|
||||||
self.save_file_data(filepath, json.dumps(filedata), cursor)
|
self.save_file_data(filepath, json.dumps(filedata), cursor)
|
||||||
|
|
||||||
cursor.execute("CALL final_line_post_import(%s);", (fileinfo["sequence"],))
|
cursor.execute("CALL final_line_post_import(%s);", (fileinfo["sequence"],))
|
||||||
|
|
||||||
self.maybe_commit()
|
self.maybe_commit()
|
||||||
@@ -662,7 +666,7 @@ class Datastore:
|
|||||||
"""
|
"""
|
||||||
Remove final data for a sequence.
|
Remove final data for a sequence.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
if cursor is None:
|
if cursor is None:
|
||||||
cur = self.conn.cursor()
|
cur = self.conn.cursor()
|
||||||
else:
|
else:
|
||||||
@@ -674,4 +678,4 @@ class Datastore:
|
|||||||
self.maybe_commit()
|
self.maybe_commit()
|
||||||
# We do not commit if we've been passed a cursor, instead
|
# We do not commit if we've been passed a cursor, instead
|
||||||
# we assume that we are in the middle of a transaction
|
# we assume that we are in the middle of a transaction
|
||||||
|
|
||||||
|
|||||||
84
etc/db/upgrades/upgrade10-83be83e4→53ed096e-v0.2.0.sql
Normal file
84
etc/db/upgrades/upgrade10-83be83e4→53ed096e-v0.2.0.sql
Normal file
@@ -0,0 +1,84 @@
|
|||||||
|
-- Upgrade the database from commit 83be83e4 to 53ed096e.
|
||||||
|
--
|
||||||
|
-- New schema version: 0.2.0
|
||||||
|
--
|
||||||
|
-- ATTENTION:
|
||||||
|
--
|
||||||
|
-- ENSURE YOU HAVE BACKED UP THE DATABASE BEFORE RUNNING THIS SCRIPT.
|
||||||
|
--
|
||||||
|
--
|
||||||
|
-- NOTE: This upgrade affects all schemas in the database.
|
||||||
|
-- NOTE: Each application starts a transaction, which must be committed
|
||||||
|
-- or rolled back.
|
||||||
|
--
|
||||||
|
-- This migrates the file hashes to address issue #173.
|
||||||
|
-- The new hashes use size, modification time, creation time and the
|
||||||
|
-- first half of the MD5 hex digest of the file's absolute path.
|
||||||
|
--
|
||||||
|
-- It's a minor (rather than patch) version number increment because
|
||||||
|
-- changes to `bin/datastore.py` mean that the data is no longer
|
||||||
|
-- compatible with the hashing function.
|
||||||
|
--
|
||||||
|
-- To apply, run as the dougal user:
|
||||||
|
--
|
||||||
|
-- psql <<EOF
|
||||||
|
-- \i $THIS_FILE
|
||||||
|
-- COMMIT;
|
||||||
|
-- EOF
|
||||||
|
--
|
||||||
|
-- NOTE: It can take a while if run on a large database.
|
||||||
|
-- NOTE: It can be applied multiple times without ill effect.
|
||||||
|
|
||||||
|
|
||||||
|
BEGIN;
|
||||||
|
|
||||||
|
CREATE OR REPLACE PROCEDURE show_notice (notice text) AS $$
|
||||||
|
BEGIN
|
||||||
|
RAISE NOTICE '%', notice;
|
||||||
|
END;
|
||||||
|
$$ LANGUAGE plpgsql;
|
||||||
|
|
||||||
|
CREATE OR REPLACE PROCEDURE migrate_hashes (schema_name text) AS $$
|
||||||
|
BEGIN
|
||||||
|
RAISE NOTICE 'Migrating schema %', schema_name;
|
||||||
|
-- We need to set the search path because some of the trigger
|
||||||
|
-- functions reference other tables in survey schemas assuming
|
||||||
|
-- they are in the search path.
|
||||||
|
EXECUTE format('SET search_path TO %I,public', schema_name);
|
||||||
|
EXECUTE format('UPDATE %I.files SET hash = array_to_string(array_append(trim_array(string_to_array(hash, '':''), 1), left(md5(path), 16)), '':'')', schema_name);
|
||||||
|
EXECUTE 'SET search_path TO public'; -- Back to the default search path for good measure
|
||||||
|
END;
|
||||||
|
$$ LANGUAGE plpgsql;
|
||||||
|
|
||||||
|
CREATE OR REPLACE PROCEDURE upgrade_10 () AS $$
|
||||||
|
DECLARE
|
||||||
|
row RECORD;
|
||||||
|
BEGIN
|
||||||
|
FOR row IN
|
||||||
|
SELECT schema_name FROM information_schema.schemata
|
||||||
|
WHERE schema_name LIKE 'survey_%'
|
||||||
|
ORDER BY schema_name
|
||||||
|
LOOP
|
||||||
|
CALL migrate_hashes(row.schema_name);
|
||||||
|
END LOOP;
|
||||||
|
END;
|
||||||
|
$$ LANGUAGE plpgsql;
|
||||||
|
|
||||||
|
CALL upgrade_10();
|
||||||
|
|
||||||
|
CALL show_notice('Cleaning up');
|
||||||
|
DROP PROCEDURE migrate_hashes (schema_name text);
|
||||||
|
DROP PROCEDURE upgrade_10 ();
|
||||||
|
|
||||||
|
CALL show_notice('Updating db_schema version');
|
||||||
|
INSERT INTO public.info VALUES ('version', '{"db_schema": "0.2.0"}')
|
||||||
|
ON CONFLICT (key) DO UPDATE
|
||||||
|
SET value = public.info.value || '{"db_schema": "0.2.0"}' WHERE public.info.key = 'version';
|
||||||
|
|
||||||
|
|
||||||
|
CALL show_notice('All done. You may now run "COMMIT;" to persist the changes');
|
||||||
|
DROP PROCEDURE show_notice (notice text);
|
||||||
|
|
||||||
|
--
|
||||||
|
--NOTE Run `COMMIT;` now if all went well
|
||||||
|
--
|
||||||
@@ -11,7 +11,7 @@
|
|||||||
"license": "UNLICENSED",
|
"license": "UNLICENSED",
|
||||||
"private": true,
|
"private": true,
|
||||||
"config": {
|
"config": {
|
||||||
"db_schema": "^0.1.0"
|
"db_schema": "^0.2.0"
|
||||||
},
|
},
|
||||||
"engines": {
|
"engines": {
|
||||||
"node": ">=14.0.0"
|
"node": ">=14.0.0"
|
||||||
|
|||||||
Reference in New Issue
Block a user