cskmeans/Program.cs

110 lines
6.5 KiB
C#

using System;
using System.IO;
using System.Linq;
public class Program
{
public static void Main()
{
var boot = DateTime.Now;
var kBgn = 2;
var kQty = 10;
var mDig = 5;
var oTests = 100;
var start = DateTime.Now;
Console.WriteLine("Started " + start.ToString() + " (" + (start - boot).TotalSeconds + "s)");
var (rawIdentities, rawLabels, rawData) = SWITRS.LoadDataSet();
// identities[lin] ; labels[lin] ; values[col][lin]
string?[] labels = rawLabels.Distinct().Order().ToArray();
string[] labelsNN = labels.WhereNotNull().ToArray();
var loaded = DateTime.Now;
Console.WriteLine("Loaded " + loaded.ToString() + " (" + (loaded - start).TotalSeconds + "s, DatasetLines=" + rawData[0].Length + ")");
var (shuffledRawIdentities, shuffledRawLabels, shuffledRawData) = (rawIdentities, rawLabels, rawData).Shuffle(42);
var shuffled = DateTime.Now;
Console.WriteLine("Shuffled " + shuffled.ToString() + " (" + (shuffled - loaded).TotalSeconds + "s)");
var (scalers, scaledData) = rawData.Scaled(Scalers.MaxMinScale);
var scaled = DateTime.Now;
Console.WriteLine("Scaled " + scaled.ToString() + " (" + (scaled - shuffled).TotalSeconds + "s)");
var (
(idTrain, lbTrain, dsTrain),
(idTest, lbTest, dsTest),
(idValidation, lbValidation, dsValidation)
) = (shuffledRawIdentities, shuffledRawLabels, scaledData).SplitTTV(96, 2, 2, 8, true);
if (dsTrain.Length <= 0 || idTrain.Length <= 0)
throw new Exception("Reduce the number of SplitTTV subdivisions");
var dsCentroid = KMeans.FromData(1, dsTrain).Centroids[0];
Console.WriteLine("> TrainCentroids=[" + string.Join(",", dsCentroid) + "]");
var splitTTV = DateTime.Now;
Console.WriteLine("SplitTTV " + splitTTV.ToString() + " (" + (splitTTV - scaled).TotalSeconds + "s, Train=" + dsTrain[0].Length + ", Test=" + dsTest[0].Length + ", Validation=" + dsValidation[0].Length + ")");
var kmeanss = new KMeans[kQty];
for (int k = 0; k < kQty; k++)
kmeanss[k] = new KMeans(kBgn + k, dsTrain.Length);
for (int k = 0; k < kQty; k++)
kmeanss[k].Fit(dsTrain);
var fit = DateTime.Now;
Console.WriteLine("" + kQty + "x Fits " + fit.ToString() + " (" + (fit - splitTTV).TotalSeconds + "s)");
var labeleds = new int[kQty][];
var distances = new double[kQty][];
for (int k = 0; k < kQty; k++)
(labeleds[k], distances[k]) = kmeanss[k].PredictWithDebug(dsTest);
Console.WriteLine("> WSSs=[" + string.Join(",", distances.Select(x => Math.Round(x.DistancesToWSS(), mDig))) + "]");
var test = DateTime.Now;
Console.WriteLine("" + kQty + "x Tests " + test.ToString() + " (" + (test - fit).TotalSeconds + "s)");
var silhouettes = new double[kQty];
for (int k = 0; k < kQty; k++)
silhouettes[k] = Math.Round((labeleds[k], dsTest).Silhouette("K=" + (k + kBgn)), mDig);
Console.WriteLine("> SILs=[" + string.Join(",", silhouettes) + "]");
var composite = new double[kQty];
for (int k = 0; k < kQty; k++)
composite[k] += Math.Round(silhouettes[k] - Math.Round(distances[k].DistancesToWSS(), mDig), mDig);
Console.WriteLine("> AGGs=[" + string.Join(",", composite) + "]");
var silhouette = DateTime.Now;
Console.WriteLine("" + kQty + "x Silhouettes " + silhouette.ToString() + " (" + (silhouette - test).TotalSeconds + "s)");
var bestk = composite.IndexOfMax();
var bestKMeans = kmeanss[bestk];
var (valClusters, valWssV) = bestKMeans.PredictWithDebug(dsValidation);
int[] prevalenceTrue = (valClusters, lbValidation, labelsNN).PrevalenceByCluster();
int[] prevalenceCompensated = (valClusters, lbValidation, labelsNN).PrevalenceByCluster(true);
var valWss = valWssV.DistancesToWSS();
var valSil = (valClusters, dsValidation).Silhouette("validation");
var outlierScoress = new double[oTests][];
var outlierScoressd = new double[oTests];
for (int o = 0; o < oTests; o++)
outlierScoress[o] = (
bestKMeans.CloneWithExtraCentroid(dsCentroid).PredictWithDebug(dsValidation).Item2,
valWss, valSil).OutlierScores(o / 10d);
for (int o = 0; o < oTests; o++)
outlierScoressd[o] = Math.Abs(.5d - outlierScoress[o].Average());
var oMin = outlierScoressd.IndexOfMin();
var outlierScoreMax = outlierScoress[oMin].Max();
var outlierScoreMin = outlierScoress[oMin].Min();
var outlierScoreAvg = outlierScoress[oMin].Average();
Console.WriteLine("> OutlierScore(o=" + (oMin / 10d) + ", Min=" + outlierScoreMin + ", Avg=" + outlierScoreAvg + ", Max=" + outlierScoreMax + ")");
Console.Write("Scalers=");
scalers.PrettyPrintScalers();
Console.Write("Centroids=");
bestKMeans.PrettyPrintCentroid();
Console.Write("DescaledCentroids=");
bestKMeans.PrettyPrintDescaledCentroid(scalers);
Console.WriteLine("Prevalence=[" + string.Join(",", prevalenceTrue) + "]");
Console.WriteLine("CompensatedPrevalence=[" + string.Join(",", prevalenceCompensated) + "]");
Console.WriteLine("bestK=" + (bestk + kBgn) + " clusters");
Console.WriteLine("WSS=" + valWss);
Console.WriteLine("Sil=" + valSil);
Console.WriteLine("Agg=" + (valSil - valWss));
var validation = DateTime.Now;
Console.WriteLine("Validation " + validation.ToString() + " (" + (validation - silhouette).TotalSeconds + "s)");
File.WriteAllText(
"model-as-a-db-row.txt",
start.ToIsoString() + "@" + validation.ToIsoString() + "@" + (kBgn + bestk) + "@" +
string.Join(";", labelsNN) + "@" + string.Join(";", prevalenceTrue) + "@" +
string.Join(";", prevalenceCompensated) + "@" + (oMin / 10d) + "@" +
valWss + "@" + valSil + "@" + (valSil - valWss) + "@" + string.Join(":", dsCentroid) + "@" +
string.Join(";", scalers.Select(x => x.Item1 + ":" + x.Item2)) + "@" +
string.Join(";", bestKMeans.Centroids.Select(x => string.Join(":", x)))
);
var done = DateTime.Now;
Console.WriteLine("All Done! " + done.ToString() + " (" + (done - validation).TotalSeconds + "s; Total=" + (done - boot).TotalSeconds + "s)");
}
}