2013年10月のブログ記事一覧-裏 RjpWiki

send+more=money その4

2013年10月22日 | ブログラミング

# 漢字かななども使える（1 文字も可）バージョン
smm = function(str) {
    str = gsub("[ \t]", "", str)
    str = unlist(strsplit(str, ""))
    if (grepl(str[1], "+-")) {
       stop("Unary minus or plus is invalid.")
    }
    str = c(str, ".")
    nChar = length(str)
    operatorPosition = NULL
    for (i in seq_len(nChar)) {
        if (match(str[i], c("+", "-", "=", "."), 0) != 0) {
            operatorPosition = c(operatorPosition, i)
        }
    }
    nTerm = length(operatorPosition)
    word = vector("list", nTerm)
    op = rep("+", nTerm + 1)
    start = 1
    for (i in seq_len(nTerm)) {
        position = operatorPosition[i]
        word[[i]] = str[start:(position - 1)]
        op[i + 1] = str[position]
        start = position + 1
    }
    charSet = unique(unlist(word))
    if (length(charSet) > 10) {
        stop("Too many characters.")
    }
    library(e1071)
    perm = t(e1071::permutations(10) - 1)
    term = matrix(0, nTerm, ncol(perm))
    nonZeroTop = rep(TRUE, ncol(perm))
    for (i in seq_len(nTerm)) {
        subscript = match(word[[i]], charSet)
        term[i, ] = colSums(10^(length(subscript):1-1) * matrix(perm[subscript, ], ncol=ncol(perm)))
        if (op[i] == "-") {
            term[i, ] = -term[i, ]
        }
        nonZeroTop = nonZeroTop & perm[subscript[1], ] != 0
    }
    leftsideTerm = colSums(term[1:(nTerm - 1), ])
    answerFlag = leftsideTerm == term[nTerm, ] & nonZeroTop
    ans = unique(t(term[, answerFlag]))
    if (nrow(ans) == 0) {
        "No solution."
    } else if (nrow(ans) == 1) {
       op = op[-nTerm-1]
       op[1] = ""
        cat(gsub("--", "-", paste(op, ans, collapse="", sep="")))
    } else {
        ans
    }
}

> smm("すももと+ももとは=ともだちだ")
9771+7713=17484

> smm("言葉たす+言葉たす言葉+言葉をたす=ふくめんざん")
6519+651965+65819=724303

> smm("ははの+ははは+ばあば+ちちの+ちちは=じいじ") # 自作
226+222+303+116+112=979

頑張ってます。クリックお願いします。

send+more=money その3

2013年10月21日 | ブログラミング

R の for の特性を生かせば，f5 は f6 のようにできる。そして，f6 は f5 より速い。あっという間に答えが出る。

関数の中で base::setdiff を最適化したものを使うようにしている。そのようにしないと（base::setdiff を使うと）むしろ f5 より遅くなる。

f6 = function() {
   setdiff = function (x, y) {
       x[match(x, y, 0L) == 0L]
   }
    m = 1L
    o = 0L
    domain = 2:9
    for (s in domain) {
        domain2 = setdiff(domain, s)
        for (e in domain2) {
            domain3 = setdiff(domain2, e)
            for (n in domain3) {
                domain4 = setdiff(domain3, n)
                for (d in domain4) {
                    domain5 = setdiff(domain4, d)
                    for (r in domain5) {
                        domain6 = setdiff(domain5, r)
                        for (y in domain6) {
                            if (sum(10^(3:0)*c(s+m, e+o, n+r, d+e)) == sum(10^(4:0)*c(m, o, n, e, y))) {
                                cat(sprintf("%s%s%s%s + %s%s%s%s = %s%s%s%s%s\n",
                                            s, e, n, d, m, o, r, e, m, o, n, e, y))
}}}}}}}}

頑張ってます。クリックお願いします。

send+more=money その2

2013年10月21日 | ブログラミング

for なんか使うと遅いだろうと思うだろうが，制約条件をせばめてやれば，むしろ他の方法より速くなる。

f5 = function() {
    m = 1
    o = 0
    for (s in 2:9) {
        for (e in 2:9) {
            if (e == s) next
            for (n in 2:9) {
                if (n == s || n == e) next
                for (d in 2:9) {
                    if (d == s || d == e || d == n) next
                    for (r in 2:9) {
                        if (r == s || r == e || r == n || r == d) next
                        for (y in 2:9) {
                            if (y == s || y == e || y == n || y == d || y == r) next
                            if (sum(10^(3:0)*c(s+m, e+o, n+r, d+e)) == sum(10^(4:0)*c(m, o, n, e, y))) {
                                cat(sprintf("%s%s%s%s + %s%s%s%s = %s%s%s%s%s\n",
                                            s, e, n, d, m, o, r, e, m, o, n, e, y))
}}}}}}}}

> benchmark(f2(), f3(), f5(), replications=2)
test replications elapsed relative user.self sys.self user.child sys.child
1 f2()            2 24.983   32.488    23.649    1.541          0         0
2 f3()            2   8.835   11.489     6.934    1.980          0         0
3 f5()            2   0.769    1.000     0.755    0.005          0         0

頑張ってます。クリックお願いします。

send+more=money

2013年10月20日 | ブログラミング

元記事の関数の実行時間のほぼ90％は，permutations が占める。

なので，残りの部分を凝ってみても速度向上は望めないので，せめて分かりやすく書いてみるか。

f2 = function() {
   library(gtools)
   perm = t(gtools::permutations(10, 8))-1
   perm = perm[, perm[1, ] != 0 & perm[5,] != 0]
   t1 = colSums(10^(3:0)*perm[1:4, ])
   t2 = colSums(10^(3:0)*perm[c(5, 6, 7, 2), ])
   t3 = colSums(10^(4:0)*perm[c(5, 6, 3, 2, 8), ])
   return(perm[, perm[5, ] != 0 & t1+t2 == t3])
}

なお，e1071 にも，permutations があるが，こちらは速い。ただ，関数仕様が nPm ではなく nPn を計算するようになっているので，重複する結果の表示を許すか，さもなくば，最終結果を取り出すところがちょっと醜いプログラムになる。

f3 = function() {
   library(e1071)
   perm = t(e1071::permutations(10)[, 1:8])-1
   perm = perm[,perm[1,] != 0 & perm[5, ] != 0]
   t1 = colSums(10^(3:0)*perm[1:4, ])
   t2 = colSums(10^(3:0)*perm[c(5, 6, 7, 2), ])
   t3 = colSums(10^(4:0)*perm[c(5, 6, 3, 2, 8), ])
   return(unique(t(perm[, perm[5, ] != 0 & t1+t2 == t3])))
}

f3 は f2 より 4 倍ほど速い。

おまけ

# DONALD + GERALD = ROBERT
f4 = function() {
    library(e1071)
perm = t(e1071::permutations(10))-1
    perm = perm[,perm[1,] != 0 & perm[5, ] != 0]
    t1 = colSums(10^(5:0)*perm[c(1, 2, 3, 4, 5, 1), ])
    t2 = colSums(10^(5:0)*perm[c(6, 7, 8, 4, 5, 1), ])
    t3 = colSums(10^(5:0)*perm[c(8, 2, 9, 7, 8, 10), ])
    ans = t1+t2 == t3
return(c(t1[ans], t2[ans], t3[ans]))
}

頑張ってます。クリックお願いします。

列ごとに同じ検定を繰り返すプログラム

2013年10月17日 | ブログラミング

某所に掲載されていたプログラム

以下のようにするとコンパクト

KrusRep <- function(dat) {
x <- ncol(dat)
mat <- matrix(sapply(dat[, -x],
function(z) kruskal.test(z ~ dat[, x])$statistic), ncol=1)
dimnames(mat) <- list(colnames(dat[, -x]), "chi-squared")
return(mat)
}

頑張ってます。クリックお願いします。

確率の問題をシミュレーションで解く

2013年10月16日 | ブログラミング

斎藤さんには二人の子供がいる。日曜日生まれの女の子はいるかと聞くと、いると言う。
では、もう一人も女の子である確率は？

分かりやすくするために，冗長に書く

n = 100000000
d = data.frame(mf1=sample(2, n, replace=TRUE), mf2=sample(2, n, replace=TRUE),
dw1=sample(7, n, replace=TRUE), dw2=sample(7, n, replace=TRUE))
d2 = d[(d$mf1 == 2 & d$dw1 == 1) | (d$mf2 == 2 & d$dw2 == 1),]
sum(d2$mf1 == 2 & d2$mf2 == 2)/nrow(d2)
13/27 # 理論値

mf1，mf2 は性別（2 が女），dw1，dw2 は生まれ曜日（1 が日曜日）

頑張ってます。クリックお願いします。

とある弁当屋の...

2013年10月11日 | 統計学

石田基弘著「とある弁当屋の統計技師-データ分析のはじめかた」共立出版

読んでみました。

出版社の校正係，もっとしっかり仕事しなさい

ii ページ

コンピューター　→　普通は，「コンピュータ」
一日キーボードを叩きながら　→　一日中キーボードを叩きながら
「ハッカー」という言葉の本来の意味と違った使い方がされている

4 ページ

若ければなんとかなるのだろう　→　若ければなんとかなるだろう
数ヶ月前　→　数か月前

6 ページ

つま先には下駄が引っ掛けられ　→　足には下駄を突っ掛け

13 ページ

統計要約量　→　要約統計量

14, 15, 17 ページ

計算式の分子の括弧は不要

16, 18, 23, 24, 46 ページなど数多くある

並びかえる　→　並べかえる

16 ページ

中央値を尺度と呼ぶのは不適

22 ページ

英語では boxplot といいます　→　英語では box and whisker plot といいます

25 ページ

四分位範囲　→　四分範囲（検索で多数派。また 46 ページでは四分範囲としている）

28 ページ（他にも何箇所か。たとえば 46 ページに同じような説明）

第1四分位数から第3四分位数の範囲を『四分位範囲』という
↓
第3四分位数-第1四分位数を『四分範囲』という

38 ページ

こっちがお金はらうから　←　どのような場合でも客は金をはらうモノ

40 ページ

壁に押されるように　→　壁に押しつけられるように（？？）

52 ページ

お弁当の売上金額に限らず，データの平均値とバラツキというのは，平均値を中心に左右対称に散らばっているという特徴があります
↑
それは言い過ぎ。いつも，どんなものでもではない。

55 ページ

専門的には標本平均値　→　専門的には標本平均（というほうが多い？？）

62 ページ

標本のこうした性質を，データ分析では「中心極限定理」といいます
↓
この定義は正確ではない

63 ページ

獄門縛り首　→　獄門（囚人の首を牢獄の門や刑場の木架の上などにさらし，見せしめにすること。獄門のあと首を縛っても，2回は死ねない。）

65 ページ

二項分布の図は棒グラフで示すべき。分布関数（累積度数分布関数）なら階段グラフでよいが。

68 ページ

白いズキン　→　乱子は頭に「三角巾」をつけていたはず（6 ページ），もっとも，イラストではいつも頭には何もつけていないようだ

77 ページの他何箇所も

ノーパソ

私は，品性下劣なので，「ノーパン」と読んでしまい，目を疑った(^_^)
そんな省略しないで，ノートパソコンって書いてください

91 ページ

気温と消費量（リットル）の共分散
↓
気温と消費量（ミリリットル）の共分散
（著者のサポートページで訂正済み）

98 ページ

点から直線に垂直の線を引くんです。これを垂線といいます。
↓
データを表す点から回帰直線まで、Y軸に平行に直線を引きます。

これは著者のサポートページで訂正済み

以下は未訂正

この垂線たちが最も短くなるように
↓
この垂線たちの長さの自乗和が最も小さくなるように
（正確に言えば）

99 ページ

図中の
データと直線を垂線　→　データと回帰直線
著者のサポートページで，訂正済み

102 ページ

ズボンからハンカチを引っ張り出して
↓
ズボンのポケットからハンカチを引っ張り出して
（でないと，不潔っぽいよね）

105 ページ

スチューデント(Student)さんの綴りから，最後のtを取ったそうですけど

まあ，二項文太君が言うだけなのでよいけど，本当はそうじゃないはず

「偽名」というのも「ペンネーム」と書く方がよさそう。

108 ページ他何箇所も

回帰係数が0.19078と推定される確率を求める
↓
回帰係数の絶対値が0.19078以上になる確率を求める
（複雑になろうが正確に書くべし）

109 ページ

データから0とは異なる係数が推定される確率
↓
正確に書くべし

177, 178, 179 ページ

log は立体で書かないといけない（数式における関数名一般の書式）

193 ページ

(14/25) * (12/25) 　→　(11/25) * (12/25)

著者のサポートページにて訂正済み

209 ページ

分割表の中に5未満の数値が1つでも含まれている場合
↓
分割表の期待値に5未満の数値が1つでも含まれている場合

頑張ってます。クリックお願いします。

ダメ出し：統計学の問題を作るのも難しいもので...

2013年10月11日 | 統計学

CodeIQで「『データサイエンティスト養成読本』著者陣さんの問題」やってみたけど…
http://m884.hateblo.jp/entry/2013/09/19/171218

なんですけどね。どっちもどっち？？？

問題文は孫引き（大元の記事は参照できなくなっている）だが，

問1. Rに標準装備されているあやめ（Iris）のデータを用いて、あやめのがく片の長さ（Sepal.Length）とがく片の幅（Sepal.Width）の相関分析を行いました。

    【Rの実行結果】

    上記の【Rの実行結果】を参考にし、次の中から正しいものを一つ選んでください。

    a) LengthとWidthの間の相関係数r=-0.11757となっているため、Lengthが大きいほど、Widthが小さくなるといえる
    b) 相関係数ｒの検定結果でｐ値が0.1519と有意水準0.05(5％)より大きくなっているため、２変数間に有意な相関があるとはいえない
    c) aとbで統計量を精査しているため、データをプロットして散布図を作成することに意味はない

R の実行結果は示されていないが，

    > cor.test(iris$Sepal.Length, iris$Sepal.Width)
   Pearson's product-moment correlation
    data: iris$Sepal.Length and iris$Sepal.Width
    t = -1.4403, df = 148, p-value = 0.1519
    alternative hypothesis: true correlation is not equal to 0
    95 percent confidence interval:
     -0.27269325 0.04351158
    sample estimates:
           cor
    -0.1175698

でもあろうか。

ブログ筆者：正解はbらしいのですが、私はaと答えました。なんでaじゃないのかちょっとよくわかりません。なんか根本的に勘違いしているんでしょうか…。

出題者のレビュー：aは、相関係数の理解が誤っています。
相関係数は-1から1の値をとり、その符号で相関関係の方向性を表し、絶対値の大きさで相関関係の強さを示します。
一般的には次のように言われることが多いです。
|r|=0.7～1：強い相関あり
|r|=0.4～0.7：やや相関あり
|r|=0～0.2：ほとんど相関なし

b については，

ブログ筆者：bが正解となっていますが、微妙なラインですが私は誤りだと思います。「有意水準0.05(5％)より大きくなっているため」というのは有意水準を所与のものとしている書き方だと思います。「有意水準を5％に定めた場合は」とすべきでしょう。

=======================

c での散布図に関しての書きぶりからは，「R の実行結果」に散布図は示されていないのではないかと思った次第だが，散布図は以下のようになる。

plot(iris$Sepal.Length, iris$Sepal.Width, cex=0.5)

残念ながら，このような単純な散布図では，「負の相関で相関係数は -0.11757」ということしか分からないだろう。

しかし，iris データについては，iris$Species を適切に評価しないと正しい分析は出来ないことはよく知られていると思うのだが。知らなくても，注意深い人は左上に位置するデータの様相が気になるかもしれない。

iris$Species を考慮した，以下のような「適切な散布図」を描けば，iris データの真相に迫れるだろう。

plot(iris$Sepal.Length, iris$Sepal.Width, cex=0.5,
col=rep(c("#aa000060", "#00aa0060", "#0000aa60"), each=50),
pch=rep(c(15, 16, 17), each=50))

つまり，Sepal.Length と Sepal.Width の関連の程度は Species により若干は異なるものの，二変数には正の相関があるということである。

> lapply(split(iris, iris$Species), function(d) cor.test(d$Sepal.Length, d$Sepal.Width))
$setosa

   Pearson's product-moment correlation

data: d$Sepal.Length and d$Sepal.Width
t = 7.6807, df = 48, p-value = 6.71e-10
alternative hypothesis: true correlation is not equal to 0
95 percent confidence interval:
0.5851391 0.8460314
sample estimates:
      cor
0.7425467

$versicolor

   Pearson's product-moment correlation

data: d$Sepal.Length and d$Sepal.Width
t = 4.2839, df = 48, p-value = 8.772e-05
alternative hypothesis: true correlation is not equal to 0
95 percent confidence interval:
0.2900175 0.7015599
sample estimates:
      cor
0.5259107

$virginica

   Pearson's product-moment correlation

data: d$Sepal.Length and d$Sepal.Width
t = 3.5619, df = 48, p-value = 0.0008435
alternative hypothesis: true correlation is not equal to 0
95 percent confidence interval:
0.2049657 0.6525292
sample estimates:
      cor
0.4572278

解答の選択肢 a も b も，Species の存在を考慮していない。適切な分析をすれば，正反対の結果になる。Species ごとに分析すれば，Sepal.Width と Sepal.Length 間には有意な正の相関が認められる。

ちなみに，選択肢 c は誤りであるのは明白だが，不適切な散布図では描いても意味がない。

結論として，問 1 には正解がない。

頑張ってます。クリックお願いします。

記事一覧 | 画像一覧 | フォロワー一覧 | フォトチャンネル一覧

PVアクセスランキングにほんブログ村

プロフィール

フォロー中フォローするフォローする

バックナンバー

2025年03月

2025年02月

2025年01月

2024年12月

2024年11月

2024年10月

2024年09月

2024年08月

2024年07月

2024年06月

2024年05月

2024年04月

2024年03月

2024年02月

2024年01月

2023年12月

2023年11月

2023年10月

2023年09月

2023年08月

2023年07月

2023年06月

2023年05月

2023年04月

2023年03月

2023年02月

2023年01月

2022年12月

2022年11月

2022年10月

2022年09月

2022年08月

2022年07月

2022年06月

2022年05月

2022年04月

2022年03月

2022年02月

2022年01月

2021年12月

2021年11月

2021年10月

2021年09月

2021年08月

2021年07月

2021年06月

2021年05月

2021年04月

2021年03月

2021年02月

2021年01月

2020年12月

2020年11月

2020年10月

2020年09月

2020年08月

2020年07月

2020年06月

2020年05月

2020年04月

2020年03月

2020年02月

2020年01月

2019年12月

2019年11月

2019年09月

2019年08月

2019年07月

2019年06月

2019年04月

2018年12月

2018年09月

2018年08月

2018年07月

2018年06月

2018年05月

2018年03月

2018年02月

2018年01月

2017年12月

2017年11月

2017年10月

2017年09月

2017年08月

2017年07月

2017年06月

2017年05月

2017年04月

2017年03月

2017年02月

2017年01月

2016年12月

2016年11月

2016年10月

2016年09月

2016年08月

2016年07月

2016年06月

2016年05月

2016年04月

2016年03月

2016年02月

2016年01月

2015年12月

2015年11月

2015年10月

2015年09月

2015年08月

2015年07月

2015年06月

2015年05月

2015年04月

2015年03月

2015年02月

2015年01月

2014年12月

2014年11月

2014年10月

2014年09月

2014年08月

2014年07月

2014年06月

2014年05月

2014年04月

2014年03月

2014年02月

2014年01月

2013年12月

2013年11月

2013年10月

2013年09月

2013年08月

2013年07月

2013年06月

2013年05月

2013年04月

2013年03月

2013年02月

2013年01月

2012年12月

2012年11月

2012年10月

2012年09月

2012年08月

2012年07月

2012年06月

2012年05月

2012年04月

2012年03月

2012年02月

2012年01月

2011年12月

2011年11月

2011年10月

2011年09月

2011年07月

2011年06月

2011年05月

2011年04月

2011年03月

2011年02月

2011年01月

2010年09月

2010年08月

2010年07月

カレンダー

2013年10月
日	月	火	水	木	金	土
		1	2	3	4	5
6	7	8	9	10	11	12
13	14	15	16	17	18	19
20	21	22	23	24	25	26
27	28	29	30	31

前月

次月

goo blog お知らせ

	【PR】プロ直伝・dポイントをザクザクためる術
	【PR】安い＆大量の「訳あり商品」がヤバい！
	【コメント募集中】一番好きな「漫画」は何ですか？
	訪問者数に応じてdポイント最大1,000pt当たる！
	dポイントが当たる！無料『毎日くじ』