From a592ab5f6c50d7811456edae5412d9d659ca5fea Mon Sep 17 00:00:00 2001 From: "D. Berge" Date: Thu, 17 Mar 2022 18:32:09 +0100 Subject: [PATCH] Use digests rather than timestamps for QC execution. Using timestamps does not work as we might be importing files with timestamps older than the last QC run. Those would not be detected by a timestamp based method but would be by this digest based approach. There is a project-wide digest and per sequence digests. The former takes the path and hashes of all files known to Dougal for this project (the `files` table), concatenates them and computes the MD5 checksum. Sequence digests do the same but only including the files related to that sequence. --- lib/www/server/lib/qc/index.js | 28 ++++---- lib/www/server/lib/qc/last-modified.js | 93 +++++++++++++++++++++++++- 2 files changed, 107 insertions(+), 14 deletions(-) diff --git a/lib/www/server/lib/qc/index.js b/lib/www/server/lib/qc/index.js index 1657f43..7b444bf 100644 --- a/lib/www/server/lib/qc/index.js +++ b/lib/www/server/lib/qc/index.js @@ -5,7 +5,7 @@ const Cursor = require('pg-cursor'); const { pool, setSurvey, transaction, fetchRow } = require('../db/connection') const { project, sequence, configuration, info } = require('../db') const flattenQCDefinitions = require('./flatten'); -const { projectLastModified, sequenceLastModified } = require('./last-modified'); +const { projectHash, sequenceHash } = require('./last-modified'); const { runShotsQC, saveShotsQC } = require('./shots'); const { runSequenceQCs, saveSequenceQCs } = require('./sequences'); @@ -35,15 +35,14 @@ async function main () { if (!project.archived) { const QCTstamp = new Date(); - const projectTstamp = await projectLastModified(projectId); - const updatedOn = await info.get(projectId, "qc/updatedOn"); - const lastQCTstamp = isNaN(new Date(updatedOn)) ?
-Infinity : new Date(updatedOn); - console.log("QCTstamp", QCTstamp); - console.log("projectTstamp", projectTstamp); - console.log("lastQCTstamp", lastQCTstamp); - if (projectTstamp >= lastQCTstamp) { - console.log("projectTstamp >= lastQCTstamp", projectId, projectTstamp, lastQCTstamp, projectTstamp >= lastQCTstamp); + const currentQCHash = await projectHash(projectId); + const lastQCHash = await info.get(projectId, "qc/hash"); + console.log("projectHash", projectHash); + console.log("lastQCHash", lastQCHash); + + if (currentQCHash != lastQCHash) { + console.log("currentQCHash != lastQCHash", projectId, currentQCHash, lastQCHash); // Fetch definitions and parameters const { definitions, parameters } = await getProjectQCConfig(projectId); @@ -60,14 +59,17 @@ async function main () { // Run shot QCs for (const seq of sequences) { const sequenceNumber = seq.sequence; - const sequenceTstamp = await sequenceLastModified(projectId, sequenceNumber); + const sequenceCurrentHash = await sequenceHash(projectId, sequenceNumber); + const sequenceLastQCHash = seq.meta?.lastQCHash; + console.log("sequenceCurrentHash", sequenceCurrentHash); + console.log("sequenceLastQCHash", sequenceLastQCHash); - if (sequenceTstamp >= lastQCTstamp) { + if (sequenceCurrentHash != sequenceLastQCHash) { const results = await runShotsQC(projectId, sequenceNumber, shotQCs, parameters); await saveShotsQC(projectId, {[sequenceNumber]: results}); -// console.log("Saved", sequenceNumber); + await sequenceHash(projectId, sequenceNumber, sequenceCurrentHash); } else { console.log("NOT MODIFIED: SEQ", sequenceNumber); @@ -80,7 +82,7 @@ async function main () { // Run survey-wide QCs TODO maybe - await info.put(projectId, "qc", {updatedOn: QCTstamp}, {}, null); + await info.put(projectId, "qc", {updatedOn: QCTstamp, hash: currentQCHash}, {}, null); } } } diff --git a/lib/www/server/lib/qc/last-modified.js b/lib/www/server/lib/qc/last-modified.js index 05c3de9..b07c6fc 100644 --- 
a/lib/www/server/lib/qc/last-modified.js +++ b/lib/www/server/lib/qc/last-modified.js @@ -38,7 +38,98 @@ async function sequenceLastModified (projectId, sequence) { return res; } +/** Return or save a hash representing the state of the project. + * + * The hash is an MD5 digest of the concatenation of all + * file paths and file hashes known to Dougal for a given + * project (files table; path, tab, hash, one row per line). + * + * The idea is that this should change every time a new + * file is imported or when an existing file is changed + * or deleted. + * + * Going only by file timestamp does not work as we may + * be importing files with timestamps older than the last + * QC run. + * NOTE(review): string_agg() has no ORDER BY, so the row order behind the digest is presumably not guaranteed; confirm. The client is not released if the query rejects (no try/finally). + * @param projectId The ID of the project to operate on (passed to setSurvey()). + * @param hash If truthy, sets 'hash' in the info row with key 'qc' to this value and returns the query result; + * if absent, returns the computed hash (undefined if the query yields no row). + */ +async function projectHash (projectId, hash) { + const client = await setSurvey(projectId); + + if (hash) { + const text = ` + INSERT INTO info (key, value) + VALUES ('qc', json_build_object('hash', to_jsonb($1::text))) + ON CONFLICT (key) + DO UPDATE + SET value = jsonb_set(info.value, ARRAY['hash'], to_jsonb($1::text)); + `; + + const values = [hash]; + const res = await client.query(text, values); + await client.release(); + return res; + } else { + const text = ` + SELECT md5(text) hash FROM ( + SELECT string_agg(path || E'\t' || hash, E'\n') AS text + FROM files + ) AS t; + `; + + const res = ((await client.query(text))?.rows ?? [])[0]?.hash; + await client.release(); + return res; + } +} + +/** Return or save a hash representing the state of a sequence. + * With a truthy hash: stores it under 'lastQCHash' in raw_lines.meta for that sequence and returns the query result. Otherwise: returns the MD5 of that sequence's hash values from raw_lines_files and final_lines_files joined by tabs (undefined if there is no row). + * Analogous to projectHash() but for a specific sequence. NOTE(review): string_agg() has no ORDER BY here and the client is not released if the query rejects.
+ */ +async function sequenceHash (projectId, sequence, hash) { + const client = await setSurvey(projectId); + + if (hash) { + const text = ` + UPDATE raw_lines + SET + meta = jsonb_set(meta, array['lastQCHash'], to_jsonb($2::text)) + WHERE sequence = $1; + `; + + const values = [ sequence, hash ]; + + const res = await client.query(text, values); + await client.release(); + return res; + } else { + const text = ` + SELECT sequence, md5(string_agg(hash, E'\t')) AS hash + FROM ( + SELECT * + FROM raw_lines_files + UNION SELECT * + FROM final_lines_files + ) AS t + GROUP BY sequence + HAVING sequence = $1; + `; + + const values = [ sequence ]; + + const res = ((await client.query(text, values))?.rows ?? [])[0]?.hash; + await client.release(); + return res; + } +} + module.exports = { projectLastModified, - sequenceLastModified + sequenceLastModified, + projectHash, + sequenceHash };