Use digests rather than timestamps for QC execution.

Using timestamps does not work, as we might be
importing files with timestamps older than the
last QC run. Those would not be detected by a
timestamp-based method but would be by this
digest-based approach.

There is a project-wide digest and per-sequence
digests. The former takes the paths and hashes of
all files known to Dougal for this project (the
`files` table), concatenates them and computes
the MD5 checksum. Sequence digests do the same
but only including the files related to that
sequence.
This commit is contained in:
D. Berge
2022-03-17 18:32:09 +01:00
parent 21d5383882
commit a592ab5f6c
2 changed files with 107 additions and 14 deletions

View File

@@ -5,7 +5,7 @@ const Cursor = require('pg-cursor');
const { pool, setSurvey, transaction, fetchRow } = require('../db/connection')
const { project, sequence, configuration, info } = require('../db')
const flattenQCDefinitions = require('./flatten');
const { projectLastModified, sequenceLastModified } = require('./last-modified');
const { projectHash, sequenceHash } = require('./last-modified');
const { runShotsQC, saveShotsQC } = require('./shots');
const { runSequenceQCs, saveSequenceQCs } = require('./sequences');
@@ -35,15 +35,14 @@ async function main () {
if (!project.archived) {
const QCTstamp = new Date();
const projectTstamp = await projectLastModified(projectId);
const updatedOn = await info.get(projectId, "qc/updatedOn");
const lastQCTstamp = isNaN(new Date(updatedOn)) ? -Infinity : new Date(updatedOn);
console.log("QCTstamp", QCTstamp);
console.log("projectTstamp", projectTstamp);
console.log("lastQCTstamp", lastQCTstamp);
if (projectTstamp >= lastQCTstamp) {
console.log("projectTstamp >= lastQCTstamp", projectId, projectTstamp, lastQCTstamp, projectTstamp >= lastQCTstamp);
const currentQCHash = await projectHash(projectId);
const lastQCHash = await info.get(projectId, "qc/hash");
console.log("projectHash", projectHash);
console.log("lastQCHash", lastQCHash);
if (currentQCHash != lastQCHash) {
console.log("currentQCHash != lastQCHash", projectId, currentQCHash, lastQCHash);
// Fetch definitions and parameters
const { definitions, parameters } = await getProjectQCConfig(projectId);
@@ -60,14 +59,17 @@ async function main () {
// Run shot QCs
for (const seq of sequences) {
const sequenceNumber = seq.sequence;
const sequenceTstamp = await sequenceLastModified(projectId, sequenceNumber);
const sequenceCurrentHash = await sequenceHash(projectId, sequenceNumber);
const sequenceLastQCHash = seq.meta?.lastQCHash;
console.log("sequenceCurrentHash", sequenceCurrentHash);
console.log("sequenceLastQCHash", sequenceLastQCHash);
if (sequenceTstamp >= lastQCTstamp) {
if (sequenceCurrentHash != sequenceLastQCHash) {
const results = await runShotsQC(projectId, sequenceNumber, shotQCs, parameters);
await saveShotsQC(projectId, {[sequenceNumber]: results});
// console.log("Saved", sequenceNumber);
await sequenceHash(projectId, sequenceNumber, sequenceCurrentHash);
} else {
console.log("NOT MODIFIED: SEQ", sequenceNumber);
@@ -80,7 +82,7 @@ async function main () {
// Run survey-wide QCs TODO maybe
await info.put(projectId, "qc", {updatedOn: QCTstamp}, {}, null);
await info.put(projectId, "qc", {updatedOn: QCTstamp, hash: currentQCHash}, {}, null);
}
}
}

View File

@@ -38,7 +38,98 @@ async function sequenceLastModified (projectId, sequence) {
return res;
}
/** Return or save a hash representing the state of the project.
 *
 * The hash is an MD5 digest of the concatenation of all
 * file paths + file hashes found in the `files` table of
 * the given project.
 *
 * The idea is that this should change every time a new
 * file is imported or when an existing file is changed
 * or deleted.
 *
 * Going only by file timestamp does not work as we may
 * be importing files with timestamps older than the last
 * QC run.
 *
 * @param {string} projectId The ID of the project to operate on.
 * @param {string} [hash] If present (and non-empty), sets
 *   info.qc->'hash' to this value; if absent, a hash is
 *   computed and returned.
 * @returns {Promise<*>} When saving: whatever `client.query()`
 *   resolves to. When reading: the digest string, or
 *   `undefined` / `null` if the query yields no usable row
 *   (e.g. `md5()` of an empty aggregate is NULL).
 */
async function projectHash (projectId, hash) {
  // NOTE(review): setSurvey() presumably hands out a pooled client
  // scoped to the project's schema — confirm in ../db/connection.
  const client = await setSurvey(projectId);

  try {
    if (hash) {
      const text = `
        INSERT INTO info (key, value)
        VALUES ('qc', json_build_object('hash', to_jsonb($1::text)))
        ON CONFLICT (key)
        DO UPDATE
        SET value = jsonb_set(info.value, ARRAY['hash'], to_jsonb($1::text));
      `;
      const values = [hash];
      return await client.query(text, values);
    }

    // ORDER BY inside the aggregate makes the digest independent of
    // the (unspecified) order in which rows come back from the table;
    // without it the same set of files could yield different hashes.
    const text = `
      SELECT md5(text) hash FROM (
        SELECT string_agg(path || E'\t' || hash, E'\n' ORDER BY path, hash) AS text
        FROM files
      ) AS t;
    `;
    const res = await client.query(text);
    return res?.rows?.[0]?.hash;
  } finally {
    // Always hand the client back, even when the query throws.
    await client.release();
  }
}
/** Return or save a hash representing the state of a sequence.
 *
 * Analogous to projectHash() but for a specific sequence: the
 * digest is the MD5 of the tab-joined `hash` values found in
 * `raw_lines_files` and `final_lines_files` for that sequence.
 *
 * @param {string} projectId The ID of the project to operate on.
 * @param {number} sequence The sequence number.
 * @param {string} [hash] If present (and non-empty), stores it as
 *   `meta.lastQCHash` on the matching `raw_lines` row; if absent,
 *   a hash is computed and returned.
 * @returns {Promise<*>} When saving: whatever `client.query()`
 *   resolves to. When reading: the digest string, or `undefined`
 *   if no file rows exist for the sequence.
 */
async function sequenceHash (projectId, sequence, hash) {
  // NOTE(review): setSurvey() presumably hands out a pooled client
  // scoped to the project's schema — confirm in ../db/connection.
  const client = await setSurvey(projectId);

  try {
    if (hash) {
      // COALESCE guards against a NULL meta: jsonb_set(NULL, …) is
      // NULL, which would silently discard the hash being saved.
      const text = `
        UPDATE raw_lines
        SET
          meta = jsonb_set(COALESCE(meta, '{}'::jsonb), array['lastQCHash'], to_jsonb($2::text))
        WHERE sequence = $1;
      `;
      const values = [ sequence, hash ];
      return await client.query(text, values);
    }

    // - ORDER BY inside the aggregate makes the digest independent of
    //   the (unspecified) row order coming out of the UNION.
    // - Filtering with WHERE (instead of HAVING on the grouping
    //   column) avoids aggregating every other sequence first; the
    //   result is the same.
    const text = `
      SELECT sequence, md5(string_agg(hash, E'\t' ORDER BY hash)) AS hash
      FROM (
        SELECT *
        FROM raw_lines_files
        UNION SELECT *
        FROM final_lines_files
      ) AS t
      WHERE sequence = $1
      GROUP BY sequence;
    `;
    const values = [ sequence ];
    const res = await client.query(text, values);
    return res?.rows?.[0]?.hash;
  } finally {
    // Always hand the client back, even when the query throws.
    await client.release();
  }
}
module.exports = {
projectLastModified,
sequenceLastModified
sequenceLastModified,
projectHash,
sequenceHash
};