#==========
Rewriting existing programs is a good way to practice Julia.
Here, the following program is translated into Julia.
Statistical analysis in Python
http://aoki2.si.gunma-u.ac.jp/Python/PythonDir.tar.gz
Polyserial correlation coefficient
http://aoki2.si.gunma-u.ac.jp/Python/polyserial.pdf
File name: polyserial.jl    Function name: polyserial
Notes made while translating:
Julia has no pmvnorm() (yet?), so R's pmvnorm() is called instead.
optimize() does not expose the Hessian explicitly either, so a trick is
needed (the method was found on the Internet).
==========#
using Rmath, Statistics, Optim, Printf # LinearAlgebra
function polyserial(x, y; twostep=false, bins=4)
    # Negative log-likelihood; pars = [rho] (two-step) or [rho; thresholds] (full ML)
    function f(pars)
        rho = pars[1]
        if length(pars) == 1
            cts = vcat(-Inf, rowCuts, Inf)  # thresholds fixed at their marginal estimates
        else
            temp = pars[2:end]
            if any(diff(temp) .< 0)  # thresholds must be nondecreasing
                return Inf
            end
            cts = vcat(-Inf, temp, Inf)
        end
        tau = -(rho .* z .- cts') ./ sqrt(1 - rho^2)   # thresholds conditional on z
        choose = [pnorm(tau[i, y[i]+1]) for i = 1:n]   # Phi at the upper boundary of cell y[i]
        choose2 = [pnorm(tau[i, y[i]]) for i = 1:n]    # Phi at the lower boundary
        -sum(log.(dnorm.(z) .* (choose .- choose2)))
    end
    method = "Polyserial Correlation: " * (twostep ? "two step " : "") * "ML estimator"
    z = zscore(x)
    val, tab = table(y)  # category values of y and their frequencies
    n = sum(tab)
    s = length(tab)
    rowCuts = qnorm.(cumsum(tab) ./ n)[1:s-1]  # thresholds from the marginal distribution of y
    initval = std(y, corrected=false) * cor(x, y) / sum(dnorm.(rowCuts))  # starting value for rho
    if twostep  # thresholds held fixed; optimize over rho only
        result = optimize(f, [initval], BFGS())
        rho = Optim.minimizer(result)[1]
        SE = sqrt(invH[1, 1])  # invH is captured by the after_while! hook below
    else  # full ML: optimize over rho and the thresholds jointly
        result = optimize(f, vcat(initval, rowCuts), BFGS())
        pars = Optim.minimizer(result)
        rho, rowCuts = pars[1], pars[2:end]
        hes = [sqrt(invH[i, i]) for i = 1:s]  # SEs from the diagonal of the inverse Hessian
        SE, ser = hes[1], hes[2:end]
    end
    chisq = chisqfunc(y, z, rho, rowCuts, bins)
    df = s * bins - s - 1
    pvalue = pchisq(chisq, df, false)  # upper-tail probability
    println("$method\n ML estimator = $rho, Std.Err. = $SE")
    println("Test of bivariate normality:\n chisq = $chisq, df = $df, p value = $pvalue")
    if !twostep
        @printf("%13s %9s\n", "Threshold", "Std. Err.")
        for i = 1:length(rowCuts)
            @printf("%2d %9.6f %9.6f\n", i, rowCuts[i], ser[i])
        end
    end
    Dict(:method => method, :rho => rho, :SE => SE, :chisq => chisq, :df => df,
         :pvalue => pvalue, :rowCuts => rowCuts)
end
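# The returned Dict makes the results available programmatically (printing
# happens as a side effect), e.g.:
#     res = polyserial(x, y)
#     res[:rho], res[:SE]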
# The promised trick: Optim calls after_while! once when the iteration loop
# finishes, and for BFGS the state carries the current inverse-Hessian
# approximation, which we stash in a global for the standard errors above.
Optim.after_while!(d, state::Optim.BFGSState, method::BFGS, options) = global invH = state.invH
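#=
This is type piracy on an Optim internal and may break across Optim versions.
A sketch of an alternative that avoids the global, assuming the Calculus.jl
package is available (the objective g below is a made-up stand-in, not part
of this program):

    using Calculus, LinearAlgebra
    g(p) = (p[1] - 1.0)^2 + 2.0 * (p[2] + 3.0)^2
    res = optimize(g, zeros(2), BFGS())
    H = Calculus.hessian(g, Optim.minimizer(res))  # finite-difference Hessian
    se = sqrt.(diag(inv(H)))  # asymptotic SEs when g is a negative log-likelihood
=#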
# Cell probabilities of a standard bivariate normal with correlation rho over
# the grid defined by rowCuts and colCuts.
function binBvn(rho, rowCuts, colCuts)
    rowCuts = vcat(-Inf, rowCuts, Inf)
    colCuts = vcat(-Inf, colCuts, Inf)
    r = length(rowCuts) - 1
    c = length(colCuts) - 1
    P = zeros(r, c)
    S = [1 rho; rho 1]
    mu = zeros(2)
    for i = 1:r, j = 1:c
        P[i, j] = pmvnorm([rowCuts[i], colCuts[j]],
                          [rowCuts[i+1], colCuts[j+1]], mu, S)
    end
    P
end
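# The cells partition the whole plane, so the probabilities sum to one up to
# the quadrature error of pmvnorm; e.g. sum(binBvn(0.5, [0.0], [0.0])) ≈ 1.0.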
using RCall
# Rectangle probabilities of the multivariate normal, delegated to R's
# mvtnorm::pmvnorm via RCall.
pmvnorm(lower, upper, mu, S) = rcopy(R"library(mvtnorm); pmvnorm($lower, $upper, $mu, $S)")
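# A quick check of the bridge (assuming the R package mvtnorm is installed):
# for a standard bivariate normal, P(X < 0, Y < 0) = 1/4 + asin(rho)/(2pi),
# which is exactly 1/3 at rho = 0.5:
# pmvnorm([-Inf, -Inf], [0.0, 0.0], zeros(2), [1.0 0.5; 0.5 1.0])  # ≈ 0.3333333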
zscore(x) = (x .- mean(x)) / std(x, corrected=false)
function table(x) # one-way frequency table (for few distinct values)
    indices = sort(unique(x))
    counts = zeros(Int, length(indices))
    for i in indexin(x, indices)
        counts[i] += 1
    end
    return indices, counts
end
function table(x, y) # two-way frequency table
    indicesx = sort(unique(x))
    indicesy = sort(unique(y))
    counts = zeros(Int, length(indicesx), length(indicesy))
    for (i, j) in zip(indexin(x, indicesx), indexin(y, indicesy))
        counts[i, j] += 1
    end
    return indicesx, indicesy, counts
end
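# For illustration (made-up data): table([2, 1, 2, 3, 2]) returns
# ([1, 2, 3], [1, 3, 1]), i.e. the sorted distinct values and their counts.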
# Likelihood-ratio test of bivariate normality: bin z into `bins` equiprobable
# columns, cross-tabulate them against y, and compare the observed counts with
# the expected counts implied by the fitted bivariate normal model.
function chisqfunc(y, z, rho, rowCuts, bins)
    colCuts = qnorm.((1:bins-1) ./ bins)
    P = binBvn(rho, rowCuts, colCuts)
    catz = [sum(w .> vcat(-Inf, colCuts, Inf)) for w in zscore(z)]
    indexx, indexy, tab = table(y, catz)
    nonzero = tab .!= 0  # skip empty cells to avoid log(0)
    n = length(y)
    2sum(tab[nonzero] .* log.(tab[nonzero] ./ (P[nonzero] .* n)))  # G² statistic
end
x = [0.7492, -0.22295, 0.11472, 0.53715, -0.51242, 0.35807, 0.49264,
-0.51353, -0.94197, 1.15984, 1.12984, -1.02436, -1.07608, -0.30467,
0.54926, 1.35279, 2.40187, 0.37274, -0.74321, 1.71418, 0.47398,
-0.78159, 1.2034, -1.21809, 0.22509, -0.01787, 0.14278, -0.57617,
0.88083, 1.46454, -0.20298, 0.94596, -1.04114, 1.2703, 0.1639,
-0.15033, -0.41042, 1.59887, 0.58806, 0.72434, 0.89338, 0.34713,
1.42137, 0.56944, 0.73173, -1.15237, 1.72124, -0.76932, 0.07062,
-0.75912, -0.08296, 0.06515, -0.00246, 0.10176, -0.34965, -1.53095,
1.55971, -0.20873, 0.11238, 2.08786, 1.83424, 0.70741, -0.70681,
-1.48295, -2.36075, 1.81494, 0.20038, 2.29407, -0.80354, 0.0154,
-1.43625, -0.96287, 0.15512, -1.27466, 0.90138, -1.42222, -0.02011,
-0.16987, 1.55233, -0.10292, -0.39256, -0.35841, -1.03065, 0.29056,
1.69807, 0.54179, 0.24826, -1.52018, 2.50281, -0.2621, -0.89337,
-0.09901, 0.80746, 1.50104, -0.29506, 0.39425, 0.83788, 1.47223,
0.70706, -0.42811]
y = [3, 2, 1, 2, 2, 4, 3, 3, 2, 3, 4, 1, 3, 2, 2, 4, 4, 3, 1, 2,
1, 3, 4, 2, 1, 4, 2, 2, 3, 1, 1, 4, 1, 4, 1, 2, 4, 3, 2, 3, 3,
4, 4, 2, 3, 1, 2, 2, 4, 2, 1, 1, 2, 3, 2, 1, 4, 2, 3, 2, 4, 4,
1, 3, 1, 3, 1, 2, 2, 2, 1, 1, 3, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
1, 2, 3, 3, 2, 3, 1, 2, 3, 2, 3, 2, 2, 2, 3, 2, 2]
polyserial(x, y, twostep=true)
# Polyserial Correlation: two step ML estimator
# ML estimator = 0.46699921329233146, Std.Err. = 0.07931271523690714
# Test of bivariate normality:
# chisq = 5.488940449811883, df = 11, p value = 0.9051999325887348
polyserial(x, y)
# Polyserial Correlation: ML estimator
# ML estimator = 0.46619488420452654, Std.Err. = 0.08060954580245325
# Test of bivariate normality:
# chisq = 5.484065294038735, df = 11, p value = 0.9054810695989302
# Threshold Std. Err.
# 1 -0.845491 0.133867
# 2 0.305882 0.118075
# 3 1.042709 0.144813
polyserial(x, y, bins=6)
# Polyserial Correlation: ML estimator
# ML estimator = 0.46619488420452654, Std.Err. = 0.08060954580245325
# Test of bivariate normality:
# chisq = 12.914187782275436, df = 19, p value = 0.8429348531100296
# Threshold Std. Err.
# 1 -0.845491 0.133867
# 2 0.305882 0.118075
# 3 1.042709 0.144813
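# For cross-checking, the same quantities can be computed with R's polycor
# package (assuming it is installed), using the RCall bridge loaded above:
# R"library(polycor); polyserial($x, $y, ML=TRUE, std.err=TRUE)"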