// dougal-software/lib/www/server/lib/comparisons/utils.js
const d3 = require('d3-array');

// Calculate the root mean square (RMS) of position deviations.
// This computes the RMS of the Euclidean distances: sqrt( (1/n) * sum(δi² + δj²) ).
// Assumes deviations are already centered (mean deviation ~0); if normalization by
// standard deviation or range is needed, adjust accordingly.
function ijRMS(δi, δj) {
  if (!δi.length || !δj.length) return 0;
  if (δi.length !== δj.length) {
    console.warn('δi and δj have different lengths!');
  }
  let sumSquares = 0;
  const n = Math.min(δi.length, δj.length);
  for (let i = 0; i < n; i++) {
    sumSquares += (δi[i] * δi[i]) + (δj[i] * δj[i]);
  }
  const meanSquare = sumSquares / n;
  const rms = Math.sqrt(meanSquare);
  return rms;
}
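
// Example (hypothetical values; assumes the deviations are already centred):
//   ijRMS([3], [4])    // → sqrt((3² + 4²) / 1) = 5
//   ijRMS([], [0.1])   // → 0 (empty input)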

/**
 * Performs stratified sampling on an array of [line, point, δi, δj] data points.
 * Groups by line and samples proportionally to preserve shape and spread.
 *
 * @param {Array<Array<number>>} data - Input data: [[line, point, δi, δj], ...]
 * @param {number} sampleSize - Target number of samples (e.g., 2000)
 * @returns {Array<Array<number>>} Sampled data in same format
 */
function old_stratifiedSample(data, sampleSize) {
  if (!Array.isArray(data) || data.length === 0) return [];
  if (!Number.isInteger(sampleSize) || sampleSize <= 0) {
    throw new Error('sampleSize must be a positive integer');
  }

  // Group data by line (first element)
  const grouped = d3.group(data, d => d[0]);
  const totalSize = data.length;
  const sampled = [];

  // Ensure sampleSize doesn't exceed data size
  const effectiveSampleSize = Math.min(sampleSize, totalSize);

  // Iterate over each line group
  for (const [line, group] of grouped) {
    // Calculate proportional sample size for this group
    const groupSize = group.length;
    const groupSampleSize = Math.max(1, Math.round((groupSize / totalSize) * effectiveSampleSize));

    // Shuffle group and take first N elements
    const shuffled = d3.shuffle([...group]);
    sampled.push(...shuffled.slice(0, groupSampleSize));
  }

  // If sampled size is slightly off due to rounding, adjust
  if (sampled.length > effectiveSampleSize) {
    return d3.shuffle(sampled).slice(0, effectiveSampleSize);
  } else if (sampled.length < effectiveSampleSize) {
    // Pad with random samples from the rest of the dataset if needed
    const remaining = effectiveSampleSize - sampled.length;
    const additional = d3.shuffle(data.filter(d => !sampled.includes(d))).slice(0, remaining);
    sampled.push(...additional);
  }

  return sampled;
}

/**
 * Performs stratified sampling on an array of [line, point, δi, δj] data points.
 * Stratifies by line and δi quantiles to preserve shape and spread, with outlier control.
 *
 * @param {Array<Array<number>>} data - Input data: [[line, point, δi, δj], ...]
 * @param {number} sampleSize - Target number of samples (e.g., 2000)
 * @param {number} [binsPerLine=10] - Number of δi quantile bins per line
 * @returns {Array<Array<number>>} Sampled data in same format
 */
function stratifiedSample(data, sampleSize, binsPerLine = 10) {
  if (!Array.isArray(data) || data.length === 0) return [];
  if (!Number.isInteger(sampleSize) || sampleSize <= 0) {
    throw new Error('sampleSize must be a positive integer');
  }
  if (!Number.isInteger(binsPerLine) || binsPerLine <= 0) {
    throw new Error('binsPerLine must be a positive integer');
  }

  const totalSize = data.length;
  const effectiveSampleSize = Math.min(sampleSize, totalSize);
  const sampled = [];

  // Group by line
  const groupedByLine = d3.group(data, d => d[0]);

  // Compute population stats for validation
  const populationStats = computeStats(data);

  // Iterate over each line
  for (const [line, group] of groupedByLine) {
    const groupSize = group.length;
    const lineSampleSize = Math.max(1, Math.round((groupSize / totalSize) * effectiveSampleSize));

    // Create quantile-based bins for δi
    const δiValues = group.map(d => d[2]).sort(d3.ascending);
    const quantiles = d3.range(0, binsPerLine + 1).map(i => d3.quantile(δiValues, i / binsPerLine));
    const binnedData = group.map(d => {
      const δi = d[2];
      // Default to the last bin so that δi values equal to the maximum quantile
      // (which no half-open interval matches) don't fall into bin 0 by accident
      let binIndex = binsPerLine - 1;
      for (let i = 0; i < binsPerLine; i++) {
        if (δi >= quantiles[i] && δi < quantiles[i + 1]) {
          binIndex = i;
          break;
        }
      }
      return { data: d, bin: binIndex };
    });
    const groupedByBin = d3.group(binnedData, d => d.bin);

    // Allocate samples across bins, weighted by density to control outliers
    const binSampleSizes = new Map();
    let remainingLineSamples = lineSampleSize;
    const binCounts = Array(binsPerLine).fill(0);
    for (const [bin, binGroup] of groupedByBin) {
      binCounts[bin] = binGroup.length;
    }
    const maxBinCount = d3.max(binCounts);
    for (const [bin, binGroup] of groupedByBin) {
      const binSize = binGroup.length;
      // Density weighting: the densest bins are down-weighted (towards 0.5 of their
      // proportional share), so no single dense bin dominates the line's sample
      const weight = binSize > 0 ? Math.max(0.1, 1 - (binSize / maxBinCount) * 0.5) : 1;
      const binSampleSize = Math.max(1, Math.round(lineSampleSize * (binSize / groupSize) * weight));
      binSampleSizes.set(bin, Math.min(binSampleSize, binSize));
      remainingLineSamples -= binSampleSizes.get(bin);
    }

    // Distribute any remaining samples round-robin across the non-empty bins
    if (remainingLineSamples > 0) {
      const nonEmptyBins = Array.from(groupedByBin.keys());
      for (let i = 0; i < remainingLineSamples && nonEmptyBins.length > 0; i++) {
        const bin = nonEmptyBins[i % nonEmptyBins.length];
        binSampleSizes.set(bin, binSampleSizes.get(bin) + 1);
      }
    }

    // Sample from each bin
    for (const [bin, binGroup] of groupedByBin) {
      const samples = d3.shuffle([...binGroup]).slice(0, binSampleSizes.get(bin)).map(s => s.data);
      sampled.push(...samples);
    }
  }

  // Adjust sample size
  let finalSample = sampled;
  if (sampled.length > effectiveSampleSize) {
    finalSample = d3.shuffle(sampled).slice(0, effectiveSampleSize);
  } else if (sampled.length < effectiveSampleSize) {
    const remaining = effectiveSampleSize - sampled.length;
    const additional = d3.shuffle(data.filter(d => !sampled.includes(d))).slice(0, remaining);
    finalSample = [...sampled, ...additional];
  }

  // Validate and adjust if stats are off
  const sampleStats = computeStats(finalSample);
  const statTolerance = { μ: 0.1, σ: 0.2 }; // Allowable relative deviation
  const needsAdjustment =
    Math.abs(sampleStats.μ[0] - populationStats.μ[0]) / populationStats.μ[0] > statTolerance.μ ||
    Math.abs(sampleStats.μ[1] - populationStats.μ[1]) / populationStats.μ[1] > statTolerance.μ ||
    Math.abs(sampleStats.σ[0] - populationStats.σ[0]) / populationStats.σ[0] > statTolerance.σ ||
    Math.abs(sampleStats.σ[1] - populationStats.σ[1]) / populationStats.σ[1] > statTolerance.σ;

  if (needsAdjustment) {
    // Add points from under-represented regions
    const δiSample = finalSample.map(d => d[2]);
    const δiPopulation = data.map(d => d[2]);
    const quantiles = d3.range(0, binsPerLine + 1).map(i => d3.quantile(δiPopulation, i / binsPerLine));
    const sampleBins = d3.histogram().domain(d3.extent(δiPopulation)).thresholds(quantiles)(δiSample);
    const populationBins = d3.histogram().domain(d3.extent(δiPopulation)).thresholds(quantiles)(δiPopulation);
    const underSampledBins = sampleBins
      .map((b, i) => ({ bin: i, diff: populationBins[i].length / totalSize - b.length / finalSample.length }))
      .filter(b => b.diff > 0.1); // Significant under-sampling

    if (underSampledBins.length > 0) {
      const additionalSamples = [];
      for (const { bin, diff } of underSampledBins) {
        const binData = data.filter(d => d[2] >= quantiles[bin] && d[2] < quantiles[bin + 1] && !finalSample.includes(d));
        // Use each bin's own shortfall, not the first bin's, to size the top-up
        const needed = Math.round((diff * effectiveSampleSize) / 2);
        additionalSamples.push(...d3.shuffle(binData).slice(0, needed));
      }
      finalSample = d3.shuffle([...finalSample, ...additionalSamples]).slice(0, effectiveSampleSize);
    }
  }

  return finalSample;
}
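
// Example usage (a sketch; `records` is assumed to be an array of [line, point, δi, δj] rows):
//   const sample = stratifiedSample(records, 2000);      // default: 10 δi quantile bins per line
//   const finer  = stratifiedSample(records, 2000, 20);  // finer binning for long-tailed δi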

function decimate (data, decimationCount = 20) {
  // Guard against a zero or negative step (e.g. when the requested sample size exceeds the data length)
  const step = Math.max(1, decimationCount);
  return data.filter((row, index) => (index % step) === 0);
}

function computeSample (data, opts = {}) {
  const DEFAULT_SAMPLE_SIZE = 2000;
  let sample;
  if (opts.decimate === true) {
    if (opts.sampleSize > 0) {
      sample = decimate(data.records, Math.floor(data.records.length / opts.sampleSize));
    } else {
      sample = decimate(data.records, Math.floor(data.records.length / DEFAULT_SAMPLE_SIZE));
    }
  } else if (opts.decimate > 0) {
    sample = decimate(data.records, opts.decimate);
  } else if (opts.sampleSize) {
    sample = stratifiedSample(data.records, opts.sampleSize);
  } else {
    sample = stratifiedSample(data.records, DEFAULT_SAMPLE_SIZE);
  }
  return sample;
}
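
// Example usage (a sketch; `data` is assumed to be an object with a `records`
// array of [line, point, δi, δj] rows, as consumed above):
//   computeSample(data)                                        // stratified sample of 2000 rows
//   computeSample(data, { sampleSize: 500 })                   // stratified sample of 500 rows
//   computeSample(data, { decimate: 20 })                      // keep every 20th row
//   computeSample(data, { decimate: true, sampleSize: 500 })   // decimate down to ≈500 rows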

// Utility to compute summary stats (count, means, standard deviations, RMS) used for
// validation, centring and outlier detection
function computeStats(data) {
  const δi = data.map(d => d[2]);
  const δj = data.map(d => d[3]);
  const rms = Math.sqrt(d3.mean(data, d => d[2] ** 2 + d[3] ** 2));
  return {
    l: data.length,
    μ: [d3.mean(δi), d3.mean(δj)],
    σ: [d3.deviation(δi), d3.deviation(δj)],
    rms
  };
}
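
// Example (hypothetical rows):
//   computeStats([[1, 1, 0.1, -0.1], [1, 2, -0.1, 0.1]])
//   // → { l: 2, μ: [0, 0], σ: [≈0.141, ≈0.141], rms: ≈0.141 }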

function centre (data) {
  const stats = computeStats(data);
  return data.map(row => [row[0], row[1], row[2] - stats.μ[0], row[3] - stats.μ[1]]);
}

// Rows whose deviation from the mean exceeds sd standard deviations (two-sided) on either axis
function outliers (data, sd = 1.96) {
  const stats = computeStats(data);
  function fn ([l, p, i, j]) {
    return Math.abs(i - stats.μ[0]) > stats.σ[0] * sd ||
           Math.abs(j - stats.μ[1]) > stats.σ[1] * sd;
  }
  return data.filter(fn);
}

// Rows within sd standard deviations of the mean (two-sided) on both axes
function inliers (data, sd = 1.96) {
  const stats = computeStats(data);
  function fn ([l, p, i, j]) {
    return Math.abs(i - stats.μ[0]) <= stats.σ[0] * sd &&
           Math.abs(j - stats.μ[1]) <= stats.σ[1] * sd;
  }
  return data.filter(fn);
}
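
// Example usage (a sketch; `rows` is assumed to be [line, point, δi, δj] data):
//   const centred = centre(rows);    // deviations re-centred on their means
//   const extreme = outliers(rows);  // rows flagged as beyond 1.96 σ on either axis
//   const typical = inliers(rows);   // the remaining rows (complement of outliers)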

// Recursively computes the element-wise numeric difference (b - a) for keys present in a;
// values that are neither nested objects nor numeric are skipped
function difference (a, b) {
  const obj = Array.isArray(a) ? [] : {};
  for (const k in a) {
    const v0 = a[k];
    const v1 = b[k];
    if (v0 instanceof Object && v1 instanceof Object) {
      obj[k] = difference(v0, v1);
    } else if (!isNaN(Number(v0)) && !isNaN(Number(v1))) {
      obj[k] = v1 - v0;
    }
  }
  return obj;
}
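
// Example:
//   difference({ a: 1, b: { c: 2 } }, { a: 3, b: { c: 5 } })
//   // → { a: 2, b: { c: 3 } }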

// Returns all n-element combinations of the elements of a (order preserved, no repetition)
function combinations (a, n) {
  const results = [];
  function combine(current, start) {
    if (current.length === n) {
      results.push([...current]);
      return;
    }
    for (let i = start; i < a.length; i++) {
      current.push(a[i]);
      combine(current, i + 1);
      current.pop();
    }
  }
  combine([], 0);
  return results;
}
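
// Example:
//   combinations(['a', 'b', 'c'], 2)
//   // → [['a', 'b'], ['a', 'c'], ['b', 'c']]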

module.exports = {
  combinations,
  centre,
  ijRMS,
  computeStats,
  computeSample,
  stratifiedSample,
  old_stratifiedSample,
  decimate,
  difference,
  outliers,
  inliers
};