Merge branch '173-do-not-use-inodes-as-part-of-a-file-s-fingerprint' into 'devel'

Resolve "Do not use inodes as part of a file's fingerprint"

Closes #173

See merge request wgp/dougal/software!19
This commit is contained in:
D. Berge
2022-02-07 16:08:04 +00:00
3 changed files with 101 additions and 13 deletions

View File

@@ -4,6 +4,7 @@ import psycopg2
import configuration import configuration
import preplots import preplots
import p111 import p111
from hashlib import md5 # Because it's good enough
""" """
Interface to the PostgreSQL database. Interface to the PostgreSQL database.
@@ -11,13 +12,16 @@ Interface to the PostgreSQL database.
def file_hash(file): def file_hash(file):
""" """
Calculate a file hash based on its size, inode, modification and creation times. Calculate a file hash based on its name, size, modification and creation times.
The hash is used to uniquely identify files in the database and detect if they The hash is used to uniquely identify files in the database and detect if they
have changed. have changed.
""" """
h = md5()
h.update(file.encode())
name_digest = h.hexdigest()[:16]
st = os.stat(file) st = os.stat(file)
return ":".join([str(v) for v in [st.st_size, st.st_mtime, st.st_ctime, st.st_ino]]) return ":".join([str(v) for v in [st.st_size, st.st_mtime, st.st_ctime, name_digest]])
class Datastore: class Datastore:
""" """

View File

@@ -0,0 +1,84 @@
-- Upgrade the database from commit 83be83e4 to 53ed096e.
--
-- New schema version: 0.2.0
--
-- ATTENTION:
--
-- ENSURE YOU HAVE BACKED UP THE DATABASE BEFORE RUNNING THIS SCRIPT.
--
--
-- NOTE: This upgrade affects all schemas in the database.
-- NOTE: Each application starts a transaction, which must be committed
-- or rolled back.
--
-- This migrates the file hashes to address issue #173.
-- The new hashes use size, modification time, creation time and the
-- first half of the MD5 hex digest of the file's absolute path.
--
-- It's a minor (rather than patch) version number increment because
-- changes to `bin/datastore.py` mean that the data is no longer
-- compatible with the hashing function.
--
-- To apply, run as the dougal user:
--
-- psql <<EOF
-- \i $THIS_FILE
-- COMMIT;
-- EOF
--
-- NOTE: It can take a while if run on a large database.
-- NOTE: It can be applied multiple times without ill effect.
BEGIN;
CREATE OR REPLACE PROCEDURE show_notice (notice text) AS $$
BEGIN
RAISE NOTICE '%', notice;
END;
$$ LANGUAGE plpgsql;
CREATE OR REPLACE PROCEDURE migrate_hashes (schema_name text) AS $$
BEGIN
RAISE NOTICE 'Migrating schema %', schema_name;
-- We need to set the search path because some of the trigger
-- functions reference other tables in survey schemas assuming
-- they are in the search path.
EXECUTE format('SET search_path TO %I,public', schema_name);
EXECUTE format('UPDATE %I.files SET hash = array_to_string(array_append(trim_array(string_to_array(hash, '':''), 1), left(md5(path), 16)), '':'')', schema_name);
EXECUTE 'SET search_path TO public'; -- Back to the default search path for good measure
END;
$$ LANGUAGE plpgsql;
CREATE OR REPLACE PROCEDURE upgrade_10 () AS $$
DECLARE
row RECORD;
BEGIN
FOR row IN
SELECT schema_name FROM information_schema.schemata
WHERE schema_name LIKE 'survey_%'
ORDER BY schema_name
LOOP
CALL migrate_hashes(row.schema_name);
END LOOP;
END;
$$ LANGUAGE plpgsql;
CALL upgrade_10();
CALL show_notice('Cleaning up');
DROP PROCEDURE migrate_hashes (schema_name text);
DROP PROCEDURE upgrade_10 ();
CALL show_notice('Updating db_schema version');
INSERT INTO public.info VALUES ('version', '{"db_schema": "0.2.0"}')
ON CONFLICT (key) DO UPDATE
SET value = public.info.value || '{"db_schema": "0.2.0"}' WHERE public.info.key = 'version';
CALL show_notice('All done. You may now run "COMMIT;" to persist the changes');
DROP PROCEDURE show_notice (notice text);
--
--NOTE Run `COMMIT;` now if all went well
--

View File

@@ -11,7 +11,7 @@
"license": "UNLICENSED", "license": "UNLICENSED",
"private": true, "private": true,
"config": { "config": {
"db_schema": "^0.1.0" "db_schema": "^0.2.0"
}, },
"engines": { "engines": {
"node": ">=14.0.0" "node": ">=14.0.0"