Modify file hashing function.

We remove the inode from the hash, as it is unstable when the
files are on an SMB filesystem, and replace it with a truncated
MD5 digest (the first 16 hex characters) of the file path.
This commit is contained in:
D. Berge
2022-02-07 15:51:38 +01:00
parent 75f91a9553
commit 53ed096e1b

View File

@@ -4,6 +4,7 @@ import psycopg2
import configuration
import preplots
import p111
from hashlib import md5 # Because it's good enough
"""
Interface to the PostgreSQL database.
@@ -11,13 +12,16 @@ Interface to the PostgreSQL database.
def file_hash(file):
    """
    Calculate a file hash based on its name, size, modification and creation times.

    The hash is used to uniquely identify files in the database and detect if they
    have changed.

    The inode number is deliberately left out of the hash: it is unstable when
    the files are on an SMB filesystem. A truncated MD5 digest of the absolute
    file path takes its place.

    Arguments:
        file: path (str) of the file to hash; may be relative or absolute.

    Returns:
        A string of the form "size:mtime:ctime:name_digest", where name_digest
        is the first 16 hex characters of the MD5 digest of the absolute path.

    Raises:
        OSError: if os.stat() fails on the file (e.g. it does not exist).
    """
    # Resolve against the current directory so that the same file yields the
    # same digest whether the caller refers to it by a relative or an
    # absolute path.
    path = os.path.abspath(file)

    # MD5 only serves as a cheap fingerprint of the name here, not as a
    # security measure, so it is good enough.
    name_digest = md5(path.encode()).hexdigest()[:16]

    # NOTE: st_ctime is the creation time on Windows but the time of the last
    # metadata change on Unix.
    st = os.stat(path)
    return ":".join(str(v) for v in (st.st_size, st.st_mtime, st.st_ctime, name_digest))
class Datastore:
"""
@@ -390,9 +394,9 @@ class Datastore:
with self.conn.cursor() as cursor:
cursor.execute("BEGIN;")
hash = self.add_file(filepath, cursor)
if not records or len(records) == 0:
print("File has no records (or none have been detected)")
# We add the file to the database anyway to signal that we have
@@ -412,13 +416,13 @@ class Datastore:
"""
cursor.execute(qry, (fileinfo["sequence"], fileinfo["line"], ntbp, incr, json.dumps(fileinfo["meta"])))
qry = """
UPDATE raw_lines
SET meta = meta || %s
WHERE sequence = %s;
"""
cursor.execute(qry, (json.dumps(fileinfo["meta"]), fileinfo["sequence"]))
qry = """
@@ -452,7 +456,7 @@ class Datastore:
with self.conn.cursor() as cursor:
cursor.execute("BEGIN;")
hash = self.add_file(filepath, cursor)
qry = """
@@ -462,13 +466,13 @@ class Datastore:
"""
cursor.execute(qry, (fileinfo["sequence"], fileinfo["line"], json.dumps(fileinfo["meta"])))
qry = """
UPDATE raw_lines
SET meta = meta || %s
WHERE sequence = %s;
"""
cursor.execute(qry, (json.dumps(fileinfo["meta"]), fileinfo["sequence"]))
qry = """
@@ -495,7 +499,7 @@ class Datastore:
if filedata is not None:
self.save_file_data(filepath, json.dumps(filedata), cursor)
cursor.execute("CALL final_line_post_import(%s);", (fileinfo["sequence"],))
self.maybe_commit()
@@ -662,7 +666,7 @@ class Datastore:
"""
Remove final data for a sequence.
"""
if cursor is None:
cur = self.conn.cursor()
else:
@@ -674,4 +678,4 @@ class Datastore:
self.maybe_commit()
# We do not commit if we've been passed a cursor, instead
# we assume that we are in the middle of a transaction