mirror of
https://gitlab.com/wgp/dougal/software.git
synced 2025-12-06 07:57:07 +00:00
311 lines
10 KiB
JavaScript
311 lines
10 KiB
JavaScript
const d3 = require('d3-array');
|
||
|
||
// Root mean square (RMS) of 2-D position deviations:
// sqrt( (1/n) * Σ (δi² + δj²) ).
// Assumes deviations are already centred (mean deviation ~0); if normalization
// by std dev or range is needed, adjust accordingly.
//
// @param {Array<number>} δi - deviations along i
// @param {Array<number>} δj - deviations along j
// @returns {number} RMS of the Euclidean deviation magnitudes; 0 for empty input
function ijRMS(δi, δj) {
  if (!δi.length || !δj.length) return 0;

  // Mismatched lengths are tolerated (only the common prefix contributes),
  // but flag them — upstream data is expected to be paired.
  if (δi.length !== δj.length) {
    console.warn(`δi and δj have different lengths!`);
  }

  const n = Math.min(δi.length, δj.length);
  let sumSquares = 0;

  for (let i = 0; i < n; i++) {
    sumSquares += (δi[i] * δi[i]) + (δj[i] * δj[i]);
  }

  return Math.sqrt(sumSquares / n);
}
|
||
|
||
|
||
/**
 * Performs stratified sampling on an array of [line, point, δi, δj] data points.
 * Groups by line and samples proportionally to preserve shape and spread.
 *
 * @param {Array<Array<number>>} data - Input data: [[line, point, δi, δj], ...]
 * @param {number} sampleSize - Target number of samples (e.g., 2000)
 * @returns {Array<Array<number>>} Sampled data in same format
 * @throws {Error} if sampleSize is not a positive integer
 */
function old_stratifiedSample(data, sampleSize) {
  if (!Array.isArray(data) || data.length === 0) return [];
  if (!Number.isInteger(sampleSize) || sampleSize <= 0) {
    throw new Error('sampleSize must be a positive integer');
  }

  // Group data by line (first element)
  const grouped = d3.group(data, d => d[0]);
  const totalSize = data.length;
  const sampled = [];

  // Ensure sampleSize doesn't exceed data size
  const effectiveSampleSize = Math.min(sampleSize, totalSize);

  // Proportional allocation per line; every line contributes at least one row.
  for (const [line, group] of grouped) {
    const groupSize = group.length;
    const groupSampleSize = Math.max(1, Math.round((groupSize / totalSize) * effectiveSampleSize));

    // Shuffle a copy (d3.shuffle mutates) and take the first N elements.
    const shuffled = d3.shuffle([...group]);
    sampled.push(...shuffled.slice(0, groupSampleSize));
  }

  // If sampled size is slightly off due to rounding, adjust
  if (sampled.length > effectiveSampleSize) {
    return d3.shuffle(sampled).slice(0, effectiveSampleSize);
  } else if (sampled.length < effectiveSampleSize) {
    // Pad with random samples drawn from rows not already chosen.
    // A Set gives O(1) membership checks; Array#includes inside filter
    // made this step O(n²) on large datasets.
    const chosen = new Set(sampled);
    const remaining = effectiveSampleSize - sampled.length;
    const additional = d3.shuffle(data.filter(d => !chosen.has(d))).slice(0, remaining);
    sampled.push(...additional);
  }

  return sampled;
}
|
||
|
||
/**
 * Performs stratified sampling on an array of [line, point, δi, δj] data points.
 * Stratifies by line and δi quantiles to preserve shape and spread, with outlier control.
 *
 * Outline: (1) allocate the target proportionally per line; (2) split each
 * line's δi values into quantile bins and sample each bin with a mild
 * inverse-density weight; (3) trim or top up to the exact target; (4) compare
 * sample μ/σ against the population and, if they drift, add rows from
 * under-represented δi bins.
 *
 * @param {Array<Array<number>>} data - Input data: [[line, point, δi, δj], ...]
 * @param {number} sampleSize - Target number of samples (e.g., 2000)
 * @param {number} [binsPerLine=10] - Number of δi quantile bins per line
 * @returns {Array<Array<number>>} Sampled data in same format
 * @throws {Error} if sampleSize or binsPerLine is not a positive integer
 */
function stratifiedSample(data, sampleSize, binsPerLine = 10) {
  if (!Array.isArray(data) || data.length === 0) return [];
  if (!Number.isInteger(sampleSize) || sampleSize <= 0) {
    throw new Error('sampleSize must be a positive integer');
  }
  if (!Number.isInteger(binsPerLine) || binsPerLine <= 0) {
    throw new Error('binsPerLine must be a positive integer');
  }

  const totalSize = data.length;
  // Never ask for more samples than there are rows.
  const effectiveSampleSize = Math.min(sampleSize, totalSize);
  const sampled = [];

  // Group by line
  const groupedByLine = d3.group(data, d => d[0]);

  // Compute population stats for validation
  const populationStats = computeStats(data);

  // Iterate over each line
  for (const [line, group] of groupedByLine) {
    const groupSize = group.length;
    // Proportional share of the target for this line; each line gets ≥ 1 row.
    const lineSampleSize = Math.max(1, Math.round((groupSize / totalSize) * effectiveSampleSize));

    // Create quantile-based bins for δi
    const δiValues = group.map(d => d[2]).sort(d3.ascending);
    const quantiles = d3.range(0, binsPerLine + 1).map(i => d3.quantile(δiValues, i / binsPerLine));
    const binnedData = group.map(d => {
      const δi = d[2];
      let binIndex = 0;
      // NOTE(review): the half-open test below never matches the maximum δi
      // (it is not < the last quantile), so the max value defaults to bin 0;
      // clamping it into the last bin is probably the intent — confirm.
      for (let i = 0; i < binsPerLine; i++) {
        if (δi >= quantiles[i] && δi < quantiles[i + 1]) {
          binIndex = i;
          break;
        }
      }
      return { data: d, bin: binIndex };
    });
    const groupedByBin = d3.group(binnedData, d => d.bin);

    // Allocate samples across bins, inversely weighted by density to control outliers
    const binSampleSizes = new Map();
    let remainingLineSamples = lineSampleSize;
    const binCounts = Array(binsPerLine).fill(0);
    for (const [bin, binGroup] of groupedByBin) {
      binCounts[bin] = binGroup.length;
    }
    const maxBinCount = d3.max(binCounts);
    for (const [bin, binGroup] of groupedByBin) {
      const binSize = binGroup.length;
      // Inverse weighting: smaller bins (outliers) get fewer samples
      // (weight runs from 0.5 for the densest bin up to ~1; floor of 0.1).
      const weight = binSize > 0 ? Math.max(0.1, 1 - (binSize / maxBinCount) * 0.5) : 1;
      const binSampleSize = Math.max(1, Math.round(lineSampleSize * (binSize / groupSize) * weight));
      // Never request more rows than the bin actually holds.
      binSampleSizes.set(bin, Math.min(binSampleSize, binSize));
      remainingLineSamples -= binSampleSizes.get(bin);
    }

    // Distribute remaining samples round-robin over the non-empty bins.
    if (remainingLineSamples > 0) {
      const nonEmptyBins = Array.from(groupedByBin.keys());
      for (let i = 0; i < remainingLineSamples && nonEmptyBins.length > 0; i++) {
        const bin = nonEmptyBins[i % nonEmptyBins.length];
        binSampleSizes.set(bin, binSampleSizes.get(bin) + 1);
      }
    }

    // Sample from each bin (shuffle a copy — d3.shuffle mutates in place).
    for (const [bin, binGroup] of groupedByBin) {
      const samples = d3.shuffle([...binGroup]).slice(0, binSampleSizes.get(bin)).map(s => s.data);
      sampled.push(...samples);
    }
  }

  // Adjust sample size
  let finalSample = sampled;
  if (sampled.length > effectiveSampleSize) {
    finalSample = d3.shuffle(sampled).slice(0, effectiveSampleSize);
  } else if (sampled.length < effectiveSampleSize) {
    const remaining = effectiveSampleSize - sampled.length;
    // NOTE(review): Array#includes inside filter makes this O(n²); a Set of
    // the already-chosen rows would make it linear.
    const additional = d3.shuffle(data.filter(d => !sampled.includes(d))).slice(0, remaining);
    finalSample = [...sampled, ...additional];
  }

  // Validate and adjust if stats are off
  const sampleStats = computeStats(finalSample);
  const statTolerance = { μ: 0.1, σ: 0.2 }; // Allowable relative deviation
  // NOTE(review): these are relative checks — when a population mean is at or
  // near 0 (e.g. centred deviations) the division yields ±Infinity/NaN and
  // makes `needsAdjustment` unreliable; an absolute tolerance may be needed.
  const needsAdjustment =
    Math.abs(sampleStats.μ[0] - populationStats.μ[0]) / populationStats.μ[0] > statTolerance.μ ||
    Math.abs(sampleStats.μ[1] - populationStats.μ[1]) / populationStats.μ[1] > statTolerance.μ ||
    Math.abs(sampleStats.σ[0] - populationStats.σ[0]) / populationStats.σ[0] > statTolerance.σ ||
    Math.abs(sampleStats.σ[1] - populationStats.σ[1]) / populationStats.σ[1] > statTolerance.σ;

  if (needsAdjustment) {
    // Add points from underrepresented regions
    const δiSample = finalSample.map(d => d[2]);
    const δiPopulation = data.map(d => d[2]);
    const quantiles = d3.range(0, binsPerLine + 1).map(i => d3.quantile(δiPopulation, i / binsPerLine));
    // NOTE(review): d3.histogram is the deprecated alias of d3.bin in
    // d3-array ≥ 2.12 — confirm the installed version still exports it.
    const sampleBins = d3.histogram().domain(d3.extent(δiPopulation)).thresholds(quantiles)(δiSample);
    const populationBins = d3.histogram().domain(d3.extent(δiPopulation)).thresholds(quantiles)(δiPopulation);
    // diff = (population share of bin) − (sample share of bin); > 0 means the
    // bin is under-represented in the sample.
    const underSampledBins = sampleBins
      .map((b, i) => ({ bin: i, diff: populationBins[i].length / totalSize - b.length / finalSample.length }))
      .filter(b => b.diff > 0.1); // Significant under-sampling

    if (underSampledBins.length > 0) {
      const additionalSamples = [];
      for (const { bin } of underSampledBins) {
        const binData = data.filter(d => d[2] >= quantiles[bin] && d[2] < quantiles[bin + 1] && !finalSample.includes(d));
        // NOTE(review): uses underSampledBins[0].diff for every bin — the
        // current bin's own diff was probably intended; confirm before changing.
        const needed = Math.round((underSampledBins[0].diff * effectiveSampleSize) / 2);
        additionalSamples.push(...d3.shuffle(binData).slice(0, needed));
      }
      finalSample = d3.shuffle([...finalSample, ...additionalSamples]).slice(0, effectiveSampleSize);
    }
  }

  return finalSample;
}
|
||
|
||
/**
 * Uniform decimation: keep every `decimationCount`-th row (indices 0, k, 2k, …).
 *
 * @param {Array} data - rows to thin out
 * @param {number} [decimationCount=20] - keep one row per this many rows
 * @returns {Array} the retained rows, in original order
 */
function decimate (data, decimationCount = 20) {
  // Guard: callers compute the step as Math.floor(length / sampleSize), which
  // is 0 when there are fewer rows than the target; `index % 0` is NaN and
  // would silently drop every row. A step < 1 means "keep everything".
  const step = Math.max(1, Math.floor(decimationCount) || 1);
  return data.filter( (row, index) => (index % step) === 0 );
}
|
||
|
||
/**
 * Produce a reduced sample of data.records for display/analysis.
 *
 * Strategy selection via opts:
 *  - opts.decimate === true : decimate down to ~opts.sampleSize (or the default) rows
 *  - opts.decimate > 0      : decimate with that explicit step
 *  - opts.sampleSize        : stratified sample of that size
 *  - otherwise              : stratified sample of DEFAULT_SAMPLE_SIZE rows
 *
 * @param {{records: Array<Array<number>>}} data - dataset holding the rows
 * @param {{decimate?: (boolean|number), sampleSize?: number}} [opts]
 * @returns {Array<Array<number>>} the sampled rows
 */
function computeSample (data, opts = {}) {
  const DEFAULT_SAMPLE_SIZE = 2000;
  let sample;

  if (opts.decimate === true) {
    const target = opts.sampleSize > 0 ? opts.sampleSize : DEFAULT_SAMPLE_SIZE;
    // Clamp the step to ≥ 1: with fewer records than `target` the floor is 0,
    // and `index % 0` inside decimate would discard every row.
    const step = Math.max(1, Math.floor(data.records.length / target));
    sample = decimate(data.records, step);
  } else if (opts.decimate > 0) {
    sample = decimate(data.records, opts.decimate);
  } else if (opts.sampleSize) {
    // BUG FIX: original read `opt.sampleSize` (undeclared `opt`), which threw
    // a ReferenceError whenever an explicit sampleSize was passed.
    sample = stratifiedSample(data.records, opts.sampleSize);
  } else {
    sample = stratifiedSample(data.records, DEFAULT_SAMPLE_SIZE);
  }

  return sample;
}
|
||
|
||
/**
 * Summary statistics over the δi/δj columns of a [line, point, δi, δj] dataset.
 *
 * @param {Array<Array<number>>} data - rows of [line, point, δi, δj]
 * @returns {{l: number, μ: number[], σ: number[], rms: number}}
 *   row count, per-axis means, per-axis standard deviations, and the RMS of
 *   the Euclidean deviation magnitudes
 */
function computeStats(data) {
  const iDeviations = data.map(([, , i]) => i);
  const jDeviations = data.map(([, , , j]) => j);
  const meanSquare = d3.mean(data, ([, , i, j]) => i ** 2 + j ** 2);

  return {
    l: data.length,
    μ: [d3.mean(iDeviations), d3.mean(jDeviations)],
    σ: [d3.deviation(iDeviations), d3.deviation(jDeviations)],
    rms: Math.sqrt(meanSquare)
  };
}
|
||
|
||
/**
 * Re-centre the dataset: shift δi and δj so their means become zero.
 * Returns new rows; the input is left untouched.
 *
 * @param {Array<Array<number>>} data - rows of [line, point, δi, δj]
 * @returns {Array<Array<number>>} rows with mean-subtracted deviations
 */
function centre (data) {
  const { μ: [meanI, meanJ] } = computeStats(data);

  return data.map(([line, point, i, j]) => [line, point, i - meanI, j - meanJ]);
}
|
||
|
||
/**
 * Rows whose δi or δj deviation lies beyond `sd` standard deviations of the
 * dataset mean on either side.
 *
 * @param {Array<Array<number>>} data - rows of [line, point, δi, δj]
 * @param {number} [sd=1.96] - z threshold (1.96 ≈ two-tailed 95%)
 * @returns {Array<Array<number>>} the outlying rows
 */
function outliers (data, sd=1.96) {
  const stats = computeStats(data);

  function fn ([l, p, i, j]) {
    // BUG FIX: compare |deviation| so below-mean excursions are also flagged;
    // the original one-sided test missed every negative outlier.
    return Math.abs(i - stats.μ[0]) > stats.σ[0]*sd ||
           Math.abs(j - stats.μ[1]) > stats.σ[1]*sd;
  }

  return data.filter(fn)
}
|
||
|
||
/**
 * Rows whose δi and δj deviations both lie within `sd` standard deviations of
 * the dataset mean (the complement of `outliers`).
 *
 * @param {Array<Array<number>>} data - rows of [line, point, δi, δj]
 * @param {number} [sd=1.96] - z threshold (1.96 ≈ two-tailed 95%)
 * @returns {Array<Array<number>>} the inlying rows
 */
function inliers (data, sd=1.96) {
  const stats = computeStats(data);

  function fn ([l, p, i, j]) {
    // BUG FIX: compare |deviation|; without it, arbitrarily extreme
    // below-mean deviations were always accepted as inliers.
    return Math.abs(i - stats.μ[0]) <= stats.σ[0]*sd &&
           Math.abs(j - stats.μ[1]) <= stats.σ[1]*sd;
  }

  return data.filter(fn)
}
|
||
|
||
/**
 * Element-wise difference of two parallel structures: for every key of `a`,
 * recurse into matching nested objects/arrays and subtract numeric leaves
 * (b[k] - a[k]). Keys whose values are not both numeric-coercible are dropped.
 *
 * @param {Object|Array} a - baseline structure
 * @param {Object|Array} b - comparison structure (same shape)
 * @returns {Object|Array} structure of deltas, mirroring `a`'s container type
 */
function difference (a, b) {
  const result = Array.isArray(a) ? [] : {};

  for (const key in a) {
    const before = a[key];
    const after = b[key];

    if (before instanceof Object && after instanceof Object) {
      result[key] = difference(before, after);
      continue;
    }

    // Keep the leaf only when both sides coerce to numbers.
    if (!isNaN(Number(before)) && !isNaN(Number(after))) {
      result[key] = after - before;
    }
  }

  return result;
}
|
||
|
||
/**
 * All n-element combinations of the array `a`, emitted in lexicographic
 * index order (each element used at most once).
 *
 * @param {Array} a - source elements
 * @param {number} n - combination length
 * @returns {Array<Array>} every n-combination of `a`
 */
function combinations (a, n) {
  const out = [];
  const chosen = [];

  const walk = (from) => {
    if (chosen.length === n) {
      out.push([...chosen]);
      return;
    }
    for (let idx = from; idx < a.length; idx++) {
      chosen.push(a[idx]);
      walk(idx + 1);
      chosen.pop();
    }
  };

  walk(0);
  return out;
}
|
||
|
||
// Public API: statistics helpers (ijRMS, computeStats), sampling strategies
// (computeSample, stratifiedSample, old_stratifiedSample, decimate) and
// dataset utilities (centre, outliers, inliers, difference, combinations).
module.exports = {
  combinations,
  centre,
  ijRMS,
  computeStats,
  computeSample,
  stratifiedSample,
  old_stratifiedSample,
  decimate,
  difference,
  outliers,
  inliers
}
|