41 lines
2.1 KiB
C#
41 lines
2.1 KiB
C#
using System;
|
|
using System.Collections;
|
|
using System.Linq;
|
|
|
|
public static class ClusterRelabeler
|
|
{
|
|
public static int[] PrevalenceByCluster(this (int[] clusters, string?[] label, string[] labels) tpl, bool compensate = false)
|
|
{
|
|
var (clusters, label, labels) = tpl;
|
|
var nClusters = clusters.Max() + 1;
|
|
var nLabels = labels.Length;
|
|
int[][] clusterLabelFrequency = (nClusters, nLabels).Allocate2D(0);
|
|
for (int s = 0; s < clusters.Length; s++)
|
|
if (label[s] != null)
|
|
clusterLabelFrequency[clusters[s]][labels.IndexOf(label[s])]++;
|
|
int[] clusterLabel = nClusters.Allocate1D(0);
|
|
for (int k = 0; k < nClusters; k++)
|
|
clusterLabel[k] = clusterLabelFrequency[k].IndexOfMax();
|
|
if (!compensate)
|
|
return clusterLabel;
|
|
int[] clusterFrequency = clusterLabelFrequency.Select(x => x.Sum()).ToArray();
|
|
int[] labelFrequency = clusterLabelFrequency.Transpose().Select(x => x.Sum()).ToArray();
|
|
bool[] clusterHeld = nClusters.Allocate1D(false);
|
|
var clusterLabelDistinct = clusterLabel.Distinct().ToArray();
|
|
if (clusters.Distinct().Count() > labels.Length) // don't violate the pidgeonhole principle
|
|
while (clusterLabelDistinct.Length < labels.Length)
|
|
{
|
|
int[] absents = Enumerable.Range(0, nLabels).Where(x => !clusterLabel.Contains(x)).ToArray();
|
|
int leastFrequentAbscense = absents.MinBy(x => labelFrequency[x]);
|
|
double[] maxFreqClusters = clusterLabelFrequency.Select(x => ((double)x[leastFrequentAbscense]) / labelFrequency[leastFrequentAbscense]).ToArray();
|
|
for (int i = 0; i < nClusters; i++)
|
|
if (clusterHeld[i])
|
|
maxFreqClusters[i] = double.NegativeInfinity;
|
|
int maxFreqCluster = maxFreqClusters.IndexOfMax();
|
|
clusterHeld[maxFreqCluster] = true;
|
|
clusterLabel[maxFreqCluster] = leastFrequentAbscense;
|
|
clusterLabelDistinct = clusterLabel.Distinct().ToArray();
|
|
}
|
|
return clusterLabel;
|
|
}
|
|
} |