167 lines
5.2 KiB
SQL
167 lines
5.2 KiB
SQL
WITH RECURSIVE src AS ( -- BigOh: O(lines)
|
|
select distinct -- This subquery fetches the row with its ID and numeric values that will be clustered
|
|
ROW_NUMBER() OVER(ORDER BY [index] DESC) as id,
|
|
Age as age,
|
|
Height as height,
|
|
Weight as weight,
|
|
Year as year
|
|
from athlete_events
|
|
where Age is not NULL and Height is not NULL and Weight is not NULL and Year is not NULL
|
|
limit 64
|
|
), stdsclaux AS ( -- BigOh: O(lines)
|
|
SELECT -- This subquery gets the MAX and MIN for each column in order to apply standard scale
|
|
max(age) AS maxage,
|
|
min(age) AS minage,
|
|
max(height) AS maxheight,
|
|
min(height) AS minheight,
|
|
max(weight) AS maxweight,
|
|
min(weight) AS minweight,
|
|
max(year) AS maxyear,
|
|
min(year) AS minyear
|
|
from src
|
|
), scrstdscl AS ( -- BigOh: O(lines)
|
|
SELECT -- This subquery applies standard scale
|
|
src.id,
|
|
(0.0 + src.age - stdsclaux.minage) / (stdsclaux.maxage - stdsclaux.minage) AS stdsclage,
|
|
(0.0 + src.height - stdsclaux.minheight) / (stdsclaux.maxheight - stdsclaux.minheight) AS stdsclheight,
|
|
(0.0 + src.weight - stdsclaux.minweight) / (stdsclaux.maxweight - stdsclaux.minweight) AS stdsclweight,
|
|
(0.0 + src.year - stdsclaux.minyear) / (stdsclaux.maxyear - stdsclaux.minyear) AS stdsclyear
|
|
FROM src JOIN stdsclaux ON (1=1)
|
|
), scrstdscllin AS ( -- BigOh: O(lines*attributes)
|
|
SELECT -- This subquery transforms each column value into a virtual vector
|
|
id,
|
|
1 as attr,
|
|
stdsclage as val
|
|
FROM scrstdscl
|
|
UNION ALL
|
|
SELECT
|
|
id,
|
|
2 as attr,
|
|
stdsclheight as val
|
|
FROM scrstdscl
|
|
UNION ALL
|
|
SELECT
|
|
id,
|
|
3 as attr,
|
|
stdsclweight as val
|
|
FROM scrstdscl
|
|
UNION ALL
|
|
SELECT
|
|
id,
|
|
4 as attr,
|
|
stdsclyear as val
|
|
FROM scrstdscl
|
|
), scrstdscldst AS ( -- BigOh: O(POW(lines*attributes, 2))
|
|
SELECT -- This subquery calculates the distance between each distinct element
|
|
c.ida,
|
|
c.idb,
|
|
SQRT(SUM(c.dif2)) AS dist
|
|
FROM (
|
|
SELECT
|
|
a.id AS ida,
|
|
b.id AS idb,
|
|
a.attr AS attr,
|
|
POW(a.val-b.val, 2) AS dif2
|
|
FROM scrstdscllin AS a
|
|
INNER JOIN scrstdscllin AS b
|
|
ON (a.id <> b.id AND a.attr = b.attr)
|
|
) AS c
|
|
GROUP BY c.ida, c.idb
|
|
), scrstdsclgp1 AS ( -- BigOh: O(POW(lines, POW(2, desired_clusters))) ??
|
|
SELECT -- This SLOW subquery tries to solve the combinatory problem of getting the N points that are farthest away between themselves
|
|
-- Think on how to select 1 2 4 3 from the X Y below without taking taking 5 6 7 8 9 as shortcuts
|
|
-- 1 2
|
|
-- 5 6
|
|
-- 9
|
|
-- 7
|
|
-- 8
|
|
-- 3 4
|
|
-- This means discovering from the full cartesian distance graph K_9 the
|
|
-- 4 nodes in which the sum maximizes the sum, without having other intermediaty
|
|
-- point selected, such as in 1 4 2 3, as it's closer for 1 to go to 3 than to 4.
|
|
-- Also, 1 8 5 4 loop must be an invalid sequence as 1 to 5 is smaller than 1 to 8 to 5.
|
|
t1.ida as id1,
|
|
t2.ida as id2,
|
|
t3.ida as id3,
|
|
t4.ida as id4,
|
|
t1.dist + t2.dist + t3.dist + t4.dist + t12.dist + t23.dist as dist,
|
|
t1.dist + t2.dist + t3.dist + t4.dist as peri
|
|
FROM scrstdscldst AS t1
|
|
INNER JOIN scrstdscldst AS t2
|
|
ON (t1.idb = t2.ida) -- t(n-1).b = t(n).a
|
|
INNER JOIN scrstdscldst AS t3
|
|
ON (t2.idb = t3.ida) -- t(n-1).b = t(n).a
|
|
INNER JOIN scrstdscldst AS t4
|
|
ON (t3.idb = t4.ida AND t4.idb = t1.ida) -- t(n-1).b = t(n).a; if last, also t(n).b = t(first).a
|
|
INNER JOIN scrstdscldst AS t12 -- non-perimeter k graph component
|
|
ON (t12.ida = t1.ida AND t12.idb = t2.idb)
|
|
INNER JOIN scrstdscldst AS t23 -- non-perimeter k graph component
|
|
ON (t23.ida = t2.ida AND t23.idb = t3.idb)
|
|
WHERE 1=1
|
|
AND t1.ida <> t2.ida -- ensuring output idX will be distinct
|
|
AND t1.ida <> t3.ida -- ensuring output idX will be distinct
|
|
AND t1.ida <> t4.ida -- ensuring output idX will be distinct
|
|
AND t2.ida <> t3.ida -- ensuring output idX will be distinct
|
|
AND t2.ida <> t4.ida -- ensuring output idX will be distinct
|
|
AND t3.ida <> t4.ida -- ensuring output idX will be distinct
|
|
ORDER BY dist DESC, peri ASC LIMIT 1 -- only the largest distance with smallest perimeter
|
|
), scrstdsclgrv AS ( -- BigOh(desired_clusters)
|
|
SELECT -- this subquery transforms each group representative back into virtual vector form
|
|
1 as cl,
|
|
v.id,
|
|
v.attr,
|
|
v.val
|
|
FROM scrstdsclgp1 AS g
|
|
INNER JOIN scrstdscllin AS v
|
|
ON (v.id = g.id1)
|
|
UNION ALL
|
|
SELECT
|
|
2 as cl,
|
|
v.id,
|
|
v.attr,
|
|
v.val
|
|
FROM scrstdsclgp1 AS g
|
|
INNER JOIN scrstdscllin AS v
|
|
ON (v.id = g.id2)
|
|
UNION ALL
|
|
SELECT
|
|
3 as cl,
|
|
v.id,
|
|
v.attr,
|
|
v.val
|
|
FROM scrstdsclgp1 AS g
|
|
INNER JOIN scrstdscllin AS v
|
|
ON (v.id = g.id3)
|
|
UNION ALL
|
|
SELECT
|
|
4 as cl,
|
|
v.id,
|
|
v.attr,
|
|
v.val
|
|
FROM scrstdsclgp1 AS g
|
|
INNER JOIN scrstdscllin AS v
|
|
ON (v.id = g.id4)
|
|
), scrstdsclgrpdst AS ( -- BigOh(line*desired_clusters*attributes)
|
|
SELECT -- This subquery calculates the distance between each distinct element
|
|
c.id,
|
|
c.cl,
|
|
SQRT(SUM(c.dif2)) AS dist
|
|
FROM (
|
|
SELECT
|
|
a.id,
|
|
b.cl,
|
|
a.attr AS attr,
|
|
POW(a.val-b.val, 2) AS dif2
|
|
FROM scrstdscllin AS a
|
|
INNER JOIN scrstdsclgrv AS b
|
|
ON (a.attr = b.attr)
|
|
) AS c
|
|
GROUP BY c.id, c.cl
|
|
), scrstdsclgrpclf AS ( -- BigOh(lines*desired_clusters)
|
|
SELECT -- This subquery calculates the distance between each distinct element
|
|
src.*,
|
|
(SELECT cl FROM scrstdsclgrpdst AS g WHERE g.id = src.id ORDER BY dist ASC LIMIT 1) AS clf
|
|
FROM src
|
|
)
|
|
SELECT * FROM scrstdsclgrpclf
|