BigQuery で主成分分析を行う方法のメモ。 ここでは、5次元のデータを2次元に次元圧縮します。
■テーブル作成・データ登録
-- Rebuild the sample table from scratch so the script can be re-run.
drop table if exists dataset.test_data;

-- One row per sample: an id plus its feature vector.
-- The array element type has to be spelled out; a bare `array` is not a
-- valid column type.
create table dataset.test_data (
  id string,
  values array<float64>
);

-- Load the ten 5-dimensional samples with a single multi-row insert.
insert into dataset.test_data values
  ('id_1',  [1.0, 0.0, 1.0, 3.0, 0.0]),
  ('id_2',  [1.0, 2.0, 1.0, 1.0, 1.0]),
  ('id_3',  [0.0, 2.0, 0.0, 1.0, 1.0]),
  ('id_4',  [1.0, 0.0, 3.0, 2.0, 3.0]),
  ('id_5',  [0.0, 0.0, 0.0, 2.0, 0.0]),
  ('id_6',  [0.0, 0.0, 2.0, 2.0, 4.0]),
  ('id_7',  [1.0, 2.0, 1.0, 1.0, 0.0]),
  ('id_8',  [0.0, 2.0, 3.0, 2.0, 2.0]),
  ('id_9',  [1.0, 2.0, 0.0, 2.0, 0.0]),
  ('id_10', [1.0, 0.0, 3.0, 1.0, 0.0]);
■モデル作成
-- Train a PCA model that keeps 2 principal components.
-- Features are passed unscaled (scale_features = false) and the 'full'
-- solver is requested explicitly.
create or replace model dataset.test_model
options (
  model_type = 'pca'
  , num_principal_components = 2
  , scale_features = false
  , pca_solver = 'full'
)
as (
  -- Expand the array into one column per dimension.
  -- Each array holds exactly 5 elements, so only indexes 0..4 are valid;
  -- reading index 5 would fail with an out-of-bounds error.
  select
    values[0] as f0
    , values[1] as f1
    , values[2] as f2
    , values[3] as f3
    , values[4] as f4
  from
    dataset.test_data
);
■適用
-- Project every sample onto the principal components of the trained model.
-- keep_original_columns = true returns the input columns (id, f0..f4)
-- next to the principal_component_N columns.
select
  *
from
  ml.predict(model dataset.test_model,
    (
      -- Feature columns must line up with the ones used for training
      -- (f0..f4). The arrays hold 5 elements, so indexes 5 and 6 do not
      -- exist and must not be read.
      select
        id
        , values[0] as f0
        , values[1] as f1
        , values[2] as f2
        , values[3] as f3
        , values[4] as f4
      from
        dataset.test_data
    ),
    struct(true as keep_original_columns)
  );
[{
"principal_component_1": "-0.69271374229229665",
"principal_component_2": "1.6535767369786081",
"id": "id_1",
"f0": "1.0",
"f1": "0.0",
"f2": "1.0",
"f3": "3.0",
"f4": "0.0"
}, {
"principal_component_1": "0.31854554694562942",
"principal_component_2": "1.5448567976194651",
"id": "id_10",
"f0": "1.0",
"f1": "0.0",
"f2": "3.0",
"f3": "1.0",
"f4": "0.0"
}, {
"principal_component_1": "-0.67889365782004729",
"principal_component_2": "-0.94631539186280522",
"id": "id_2",
"f0": "1.0",
"f1": "2.0",
"f2": "1.0",
"f3": "1.0",
"f4": "1.0"
}, {
"principal_component_1": "-1.2041817682899503",
"principal_component_2": "-1.3374039692570403",
"id": "id_3",
"f0": "0.0",
"f1": "2.0",
"f2": "0.0",
"f3": "1.0",
"f4": "1.0"
}, {
"principal_component_1": "2.6487614707519027",
"principal_component_2": "0.36207867152786921",
"id": "id_4",
"f0": "1.0",
"f1": "0.0",
"f2": "3.0",
"f3": "2.0",
"f4": "3.0"
}, {
"principal_component_1": "-1.3025753320545592",
"principal_component_2": "0.98632696171180989",
"id": "id_5",
"f0": "0.0",
"f1": "0.0",
"f2": "0.0",
"f3": "2.0",
"f4": "0.0"
}, {
"principal_component_1": "2.8720208417866377",
"principal_component_2": "-0.5153230138544187",
"id": "id_6",
"f0": "0.0",
"f1": "0.0",
"f2": "2.0",
"f3": "2.0",
"f4": "4.0"
}, {
"principal_component_1": "-1.4274411393246851",
"principal_component_2": "-0.46000228387475217",
"id": "id_7",
"f0": "1.0",
"f1": "2.0",
"f2": "1.0",
"f3": "1.0",
"f4": "0.0"
}, {
"principal_component_1": "1.3995485642410159",
"principal_component_2": "-0.88215219479355511",
"id": "id_8",
"f0": "0.0",
"f1": "2.0",
"f2": "3.0",
"f3": "2.0",
"f4": "2.0"
}, {
"principal_component_1": "-1.9330707839436485",
"principal_component_2": "-0.40564231419518071",
"id": "id_9",
"f0": "1.0",
"f1": "2.0",
"f2": "0.0",
"f3": "2.0",
"f4": "0.0"
}]
■固有値など
-- Per-component statistics of the trained model (eigenvalue, explained
-- variance ratio, cumulative explained variance ratio).
select *
from ml.principal_component_info(model dataset.test_model);
[{
"principal_component_id": "0",
"eigenvalue": "3.020722723318388",
"explained_variance_ratio": "0.545913745178022",
"cumulative_explained_variance_ratio": "0.545913745178022"
}, {
"principal_component_id": "1",
"eigenvalue": "1.147655631054062",
"explained_variance_ratio": "0.20740764416639679",
"cumulative_explained_variance_ratio": "0.75332138934441883"
}]