Merge branch '173-do-not-use-inodes-as-part-of-a-file-s-fingerprint' into 'devel'

Resolve "Do not use inodes as part of a file's fingerprint"

Closes #173

See merge request wgp/dougal/software!19
This commit is contained in:
D. Berge
2022-02-07 16:08:04 +00:00
3 changed files with 101 additions and 13 deletions

View File

@@ -4,6 +4,7 @@ import psycopg2
import configuration
import preplots
import p111
from hashlib import md5 # Because it's good enough
"""
Interface to the PostgreSQL database.
@@ -11,13 +12,16 @@ Interface to the PostgreSQL database.
def file_hash(file):
"""
Calculate a file hash based on its size, inode, modification and creation times.
Calculate a file hash based on its name, size, modification and creation times.
The hash is used to uniquely identify files in the database and detect if they
have changed.
"""
h = md5()
h.update(file.encode())
name_digest = h.hexdigest()[:16]
st = os.stat(file)
return ":".join([str(v) for v in [st.st_size, st.st_mtime, st.st_ctime, st.st_ino]])
return ":".join([str(v) for v in [st.st_size, st.st_mtime, st.st_ctime, name_digest]])
class Datastore:
"""

View File

@@ -0,0 +1,84 @@
-- Upgrade the database from commit 83be83e4 to 53ed096e.
--
-- New schema version: 0.2.0
--
-- ATTENTION:
--
-- ENSURE YOU HAVE BACKED UP THE DATABASE BEFORE RUNNING THIS SCRIPT.
--
--
-- NOTE: This upgrade affects all schemas in the database.
-- NOTE: Each application starts a transaction, which must be committed
-- or rolled back.
--
-- This migrates the file hashes to address issue #173.
-- The new hashes use size, modification time, creation time and the
-- first half of the MD5 hex digest of the file's absolute path.
--
-- It's a minor (rather than patch) version number increment because
-- changes to `bin/datastore.py` mean that the data is no longer
-- compatible with the hashing function.
--
-- To apply, run as the dougal user:
--
-- psql <<EOF
-- \i $THIS_FILE
-- COMMIT;
-- EOF
--
-- NOTE: It can take a while if run on a large database.
-- NOTE: It can be applied multiple times without ill effect.
BEGIN;
CREATE OR REPLACE PROCEDURE show_notice (notice text) AS $$
BEGIN
RAISE NOTICE '%', notice;
END;
$$ LANGUAGE plpgsql;
CREATE OR REPLACE PROCEDURE migrate_hashes (schema_name text) AS $$
BEGIN
RAISE NOTICE 'Migrating schema %', schema_name;
-- We need to set the search path because some of the trigger
-- functions reference other tables in survey schemas assuming
-- they are in the search path.
EXECUTE format('SET search_path TO %I,public', schema_name);
EXECUTE format('UPDATE %I.files SET hash = array_to_string(array_append(trim_array(string_to_array(hash, '':''), 1), left(md5(path), 16)), '':'')', schema_name);
EXECUTE 'SET search_path TO public'; -- Back to the default search path for good measure
END;
$$ LANGUAGE plpgsql;
CREATE OR REPLACE PROCEDURE upgrade_10 () AS $$
DECLARE
row RECORD;
BEGIN
FOR row IN
SELECT schema_name FROM information_schema.schemata
WHERE schema_name LIKE 'survey_%'
ORDER BY schema_name
LOOP
CALL migrate_hashes(row.schema_name);
END LOOP;
END;
$$ LANGUAGE plpgsql;
CALL upgrade_10();
CALL show_notice('Cleaning up');
DROP PROCEDURE migrate_hashes (schema_name text);
DROP PROCEDURE upgrade_10 ();
CALL show_notice('Updating db_schema version');
INSERT INTO public.info VALUES ('version', '{"db_schema": "0.2.0"}')
ON CONFLICT (key) DO UPDATE
SET value = public.info.value || '{"db_schema": "0.2.0"}' WHERE public.info.key = 'version';
CALL show_notice('All done. You may now run "COMMIT;" to persist the changes');
DROP PROCEDURE show_notice (notice text);
--
--NOTE Run `COMMIT;` now if all went well
--

View File

@@ -11,7 +11,7 @@
"license": "UNLICENSED",
"private": true,
"config": {
"db_schema": "^0.1.0"
"db_schema": "^0.2.0"
},
"engines": {
"node": ">=14.0.0"