// dougal-software/lib/www/server/lib/comparisons/utils.js
const d3 = require('d3-array');

// Calculate the root mean square (RMS) of position deviations.
// This computes the RMS of the Euclidean distances: sqrt( (1/n) * sum(δi² + δj²) ).
// Assumes deviations are already centered (mean deviation ~0); if normalization by
// standard deviation or range is needed, adjust accordingly.
function ijRMS(δi, δj) {
  if (!δi.length || !δj.length) return 0;
  if (δi.length !== δj.length) {
    console.warn('δi and δj have different lengths!');
  }
  let sumSquares = 0;
  const n = Math.min(δi.length, δj.length);
  for (let i = 0; i < n; i++) {
    sumSquares += (δi[i] * δi[i]) + (δj[i] * δj[i]);
  }
  const meanSquare = sumSquares / n;
  const rms = Math.sqrt(meanSquare);
  return rms;
}
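
// Example (hypothetical values; assumes the deviations are already centred):
//   ijRMS([3], [4])    // → sqrt((3² + 4²) / 1) = 5
//   ijRMS([], [0.1])   // → 0 (empty input)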

/**
 * Performs stratified sampling on an array of [line, point, δi, δj] data points.
 * Groups by line and samples proportionally to preserve shape and spread.
 *
 * @param {Array<Array<number>>} data - Input data: [[line, point, δi, δj], ...]
 * @param {number} sampleSize - Target number of samples (e.g., 2000)
 * @returns {Array<Array<number>>} Sampled data in same format
 */
function old_stratifiedSample(data, sampleSize) {
  if (!Array.isArray(data) || data.length === 0) return [];
  if (!Number.isInteger(sampleSize) || sampleSize <= 0) {
    throw new Error('sampleSize must be a positive integer');
  }

  // Group data by line (first element)
  const grouped = d3.group(data, d => d[0]);
  const totalSize = data.length;
  const sampled = [];

  // Ensure sampleSize doesn't exceed data size
  const effectiveSampleSize = Math.min(sampleSize, totalSize);

  // Iterate over each line group
  for (const [line, group] of grouped) {
    // Calculate proportional sample size for this group
    const groupSize = group.length;
    const groupSampleSize = Math.max(1, Math.round((groupSize / totalSize) * effectiveSampleSize));

    // Shuffle group and take first N elements
    const shuffled = d3.shuffle([...group]);
    sampled.push(...shuffled.slice(0, groupSampleSize));
  }

  // If sampled size is slightly off due to rounding, adjust
  if (sampled.length > effectiveSampleSize) {
    return d3.shuffle(sampled).slice(0, effectiveSampleSize);
  } else if (sampled.length < effectiveSampleSize) {
    // Pad with random samples from the rest of the dataset if needed
    const remaining = effectiveSampleSize - sampled.length;
    const additional = d3.shuffle(data.filter(d => !sampled.includes(d))).slice(0, remaining);
    sampled.push(...additional);
  }

  return sampled;
}

/**
 * Performs stratified sampling on an array of [line, point, δi, δj] data points.
 * Stratifies by line and δi quantiles to preserve shape and spread, with outlier control.
 *
 * @param {Array<Array<number>>} data - Input data: [[line, point, δi, δj], ...]
 * @param {number} sampleSize - Target number of samples (e.g., 2000)
 * @param {number} [binsPerLine=10] - Number of δi quantile bins per line
 * @returns {Array<Array<number>>} Sampled data in same format
 */
function stratifiedSample(data, sampleSize, binsPerLine = 10) {
  if (!Array.isArray(data) || data.length === 0) return [];
  if (!Number.isInteger(sampleSize) || sampleSize <= 0) {
    throw new Error('sampleSize must be a positive integer');
  }
  if (!Number.isInteger(binsPerLine) || binsPerLine <= 0) {
    throw new Error('binsPerLine must be a positive integer');
  }

  const totalSize = data.length;
  const effectiveSampleSize = Math.min(sampleSize, totalSize);
  const sampled = [];

  // Group by line
  const groupedByLine = d3.group(data, d => d[0]);

  // Compute population stats for validation
  const populationStats = computeStats(data);

  // Iterate over each line
  for (const [line, group] of groupedByLine) {
    const groupSize = group.length;
    const lineSampleSize = Math.max(1, Math.round((groupSize / totalSize) * effectiveSampleSize));

    // Create quantile-based bins for δi
    const δiValues = group.map(d => d[2]).sort(d3.ascending);
    const quantiles = d3.range(0, binsPerLine + 1).map(i => d3.quantile(δiValues, i / binsPerLine));
    const binnedData = group.map(d => {
      const δi = d[2];
      // Default to the last bin so that δi values equal to the maximum quantile
      // (which no half-open interval matches) don't fall into bin 0 by accident
      let binIndex = binsPerLine - 1;
      for (let i = 0; i < binsPerLine; i++) {
        if (δi >= quantiles[i] && δi < quantiles[i + 1]) {
          binIndex = i;
          break;
        }
      }
      return { data: d, bin: binIndex };
    });
    const groupedByBin = d3.group(binnedData, d => d.bin);

    // Allocate samples across bins, weighted by density to control outliers
    const binSampleSizes = new Map();
    let remainingLineSamples = lineSampleSize;
    const binCounts = Array(binsPerLine).fill(0);
    for (const [bin, binGroup] of groupedByBin) {
      binCounts[bin] = binGroup.length;
    }
    const maxBinCount = d3.max(binCounts);
    for (const [bin, binGroup] of groupedByBin) {
      const binSize = binGroup.length;
      // Density weighting: the densest bins are down-weighted (towards 0.5 of their
      // proportional share), so no single dense bin dominates the line's sample
      const weight = binSize > 0 ? Math.max(0.1, 1 - (binSize / maxBinCount) * 0.5) : 1;
      const binSampleSize = Math.max(1, Math.round(lineSampleSize * (binSize / groupSize) * weight));
      binSampleSizes.set(bin, Math.min(binSampleSize, binSize));
      remainingLineSamples -= binSampleSizes.get(bin);
    }

    // Distribute any remaining samples round-robin across the non-empty bins
    if (remainingLineSamples > 0) {
      const nonEmptyBins = Array.from(groupedByBin.keys());
      for (let i = 0; i < remainingLineSamples && nonEmptyBins.length > 0; i++) {
        const bin = nonEmptyBins[i % nonEmptyBins.length];
        binSampleSizes.set(bin, binSampleSizes.get(bin) + 1);
      }
    }

    // Sample from each bin
    for (const [bin, binGroup] of groupedByBin) {
      const samples = d3.shuffle([...binGroup]).slice(0, binSampleSizes.get(bin)).map(s => s.data);
      sampled.push(...samples);
    }
  }

  // Adjust sample size
  let finalSample = sampled;
  if (sampled.length > effectiveSampleSize) {
    finalSample = d3.shuffle(sampled).slice(0, effectiveSampleSize);
  } else if (sampled.length < effectiveSampleSize) {
    const remaining = effectiveSampleSize - sampled.length;
    const additional = d3.shuffle(data.filter(d => !sampled.includes(d))).slice(0, remaining);
    finalSample = [...sampled, ...additional];
  }

  // Validate and adjust if stats are off
  const sampleStats = computeStats(finalSample);
  const statTolerance = { μ: 0.1, σ: 0.2 }; // Allowable relative deviation
  const needsAdjustment =
    Math.abs(sampleStats.μ[0] - populationStats.μ[0]) / populationStats.μ[0] > statTolerance.μ ||
    Math.abs(sampleStats.μ[1] - populationStats.μ[1]) / populationStats.μ[1] > statTolerance.μ ||
    Math.abs(sampleStats.σ[0] - populationStats.σ[0]) / populationStats.σ[0] > statTolerance.σ ||
    Math.abs(sampleStats.σ[1] - populationStats.σ[1]) / populationStats.σ[1] > statTolerance.σ;

  if (needsAdjustment) {
    // Add points from under-represented regions
    const δiSample = finalSample.map(d => d[2]);
    const δiPopulation = data.map(d => d[2]);
    const quantiles = d3.range(0, binsPerLine + 1).map(i => d3.quantile(δiPopulation, i / binsPerLine));
    const sampleBins = d3.histogram().domain(d3.extent(δiPopulation)).thresholds(quantiles)(δiSample);
    const populationBins = d3.histogram().domain(d3.extent(δiPopulation)).thresholds(quantiles)(δiPopulation);
    const underSampledBins = sampleBins
      .map((b, i) => ({ bin: i, diff: populationBins[i].length / totalSize - b.length / finalSample.length }))
      .filter(b => b.diff > 0.1); // Significant under-sampling

    if (underSampledBins.length > 0) {
      const additionalSamples = [];
      for (const { bin, diff } of underSampledBins) {
        const binData = data.filter(d => d[2] >= quantiles[bin] && d[2] < quantiles[bin + 1] && !finalSample.includes(d));
        // Use each bin's own shortfall, not the first bin's, to size the top-up
        const needed = Math.round((diff * effectiveSampleSize) / 2);
        additionalSamples.push(...d3.shuffle(binData).slice(0, needed));
      }
      finalSample = d3.shuffle([...finalSample, ...additionalSamples]).slice(0, effectiveSampleSize);
    }
  }

  return finalSample;
}
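
// Example usage (a sketch; `records` is assumed to be an array of [line, point, δi, δj] rows):
//   const sample = stratifiedSample(records, 2000);      // default: 10 δi quantile bins per line
//   const finer  = stratifiedSample(records, 2000, 20);  // finer binning for long-tailed δi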

function decimate (data, decimationCount = 20) {
  // Guard against a zero or negative step (e.g. when the requested sample size exceeds the data length)
  const step = Math.max(1, decimationCount);
  return data.filter((row, index) => (index % step) === 0);
}

function computeSample (data, opts = {}) {
  const DEFAULT_SAMPLE_SIZE = 2000;
  let sample;
  if (opts.decimate === true) {
    if (opts.sampleSize > 0) {
      sample = decimate(data.records, Math.floor(data.records.length / opts.sampleSize));
    } else {
      sample = decimate(data.records, Math.floor(data.records.length / DEFAULT_SAMPLE_SIZE));
    }
  } else if (opts.decimate > 0) {
    sample = decimate(data.records, opts.decimate);
  } else if (opts.sampleSize) {
    sample = stratifiedSample(data.records, opts.sampleSize);
  } else {
    sample = stratifiedSample(data.records, DEFAULT_SAMPLE_SIZE);
  }
  return sample;
}
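
// Example usage (a sketch; `data` is assumed to be an object with a `records`
// array of [line, point, δi, δj] rows, as consumed above):
//   computeSample(data)                                        // stratified sample of 2000 rows
//   computeSample(data, { sampleSize: 500 })                   // stratified sample of 500 rows
//   computeSample(data, { decimate: 20 })                      // keep every 20th row
//   computeSample(data, { decimate: true, sampleSize: 500 })   // decimate down to ≈500 rows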

// Utility to compute summary stats (count, means, standard deviations, RMS) used for
// validation, centring and outlier detection
function computeStats(data) {
  const δi = data.map(d => d[2]);
  const δj = data.map(d => d[3]);
  const rms = Math.sqrt(d3.mean(data, d => d[2] ** 2 + d[3] ** 2));
  return {
    l: data.length,
    μ: [d3.mean(δi), d3.mean(δj)],
    σ: [d3.deviation(δi), d3.deviation(δj)],
    rms
  };
}
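
// Example (hypothetical rows):
//   computeStats([[1, 1, 0.1, -0.1], [1, 2, -0.1, 0.1]])
//   // → { l: 2, μ: [0, 0], σ: [≈0.141, ≈0.141], rms: ≈0.141 }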

function centre (data) {
  const stats = computeStats(data);
  return data.map(row => [row[0], row[1], row[2] - stats.μ[0], row[3] - stats.μ[1]]);
}

// Rows whose deviation from the mean exceeds sd standard deviations (two-sided) on either axis
function outliers (data, sd = 1.96) {
  const stats = computeStats(data);
  function fn ([l, p, i, j]) {
    return Math.abs(i - stats.μ[0]) > stats.σ[0] * sd ||
           Math.abs(j - stats.μ[1]) > stats.σ[1] * sd;
  }
  return data.filter(fn);
}

// Rows within sd standard deviations of the mean (two-sided) on both axes
function inliers (data, sd = 1.96) {
  const stats = computeStats(data);
  function fn ([l, p, i, j]) {
    return Math.abs(i - stats.μ[0]) <= stats.σ[0] * sd &&
           Math.abs(j - stats.μ[1]) <= stats.σ[1] * sd;
  }
  return data.filter(fn);
}
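
// Example usage (a sketch; `rows` is assumed to be [line, point, δi, δj] data):
//   const centred = centre(rows);    // deviations re-centred on their means
//   const extreme = outliers(rows);  // rows flagged as beyond 1.96 σ on either axis
//   const typical = inliers(rows);   // the remaining rows (complement of outliers)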

// Recursively computes the element-wise numeric difference (b - a) for keys present in a;
// values that are neither nested objects nor numeric are skipped
function difference (a, b) {
  const obj = Array.isArray(a) ? [] : {};
  for (const k in a) {
    const v0 = a[k];
    const v1 = b[k];
    if (v0 instanceof Object && v1 instanceof Object) {
      obj[k] = difference(v0, v1);
    } else if (!isNaN(Number(v0)) && !isNaN(Number(v1))) {
      obj[k] = v1 - v0;
    }
  }
  return obj;
}
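
// Example:
//   difference({ a: 1, b: { c: 2 } }, { a: 3, b: { c: 5 } })
//   // → { a: 2, b: { c: 3 } }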

// Returns all n-element combinations of the elements of a (order preserved, no repetition)
function combinations (a, n) {
  const results = [];
  function combine(current, start) {
    if (current.length === n) {
      results.push([...current]);
      return;
    }
    for (let i = start; i < a.length; i++) {
      current.push(a[i]);
      combine(current, i + 1);
      current.pop();
    }
  }
  combine([], 0);
  return results;
}
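
// Example:
//   combinations(['a', 'b', 'c'], 2)
//   // → [['a', 'b'], ['a', 'c'], ['b', 'c']]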

module.exports = {
  combinations,
  centre,
  ijRMS,
  computeStats,
  computeSample,
  stratifiedSample,
  old_stratifiedSample,
  decimate,
  difference,
  outliers,
  inliers
};