问题描述
当我使用 a <- rt(10, 3) 和 b <- rnorm(10, 3) + 5 时,我试图把移位量设成正确的数字,以便计算两样本 t 检验的功效,但得到了错误的结果。互联网上有很多文献提到使用非中心性参数(ncp)来表示移位量,从而计算功效。我的问题是:如何使用非中心性参数得到等于 5 的移位量?如果我理解错了,而且从 t 分布中得到移位样本的唯一方法就是开头介绍的那种,请告诉我。
# Pre-allocate storage for 1000 simulated datasets at each of the five
# location shifts (0, 0.5, 1, 1.5, 2).
desired_length<-1000
empty_list <- vector(mode = "list",length = desired_length)
empty_list1 <- vector(mode = "list",length = desired_length)
empty_list2<-vector(mode="list",length=desired_length)
empty_list3<-vector(mode="list",length=desired_length)
empty_list4<-vector(mode="list",length=desired_length)
# Each replication draws a baseline sample h and five comparison samples,
# all of size 10 from the t distribution with 1 df (Cauchy), the comparison
# samples shifted by 0, 0.5, 1, 1.5 and 2 respectively.
for (i in 1:1000) {
h<-rt(10,1)
g<-rt(10,1)
g1<- rt(10,1)+0.5
g2<-rt(10,1)+1
g3<- rt(10,1)+1.5
g4<- rt(10,1)+2
# Each stored matrix: column 1 = baseline, column 2 = shifted sample.
a<-cbind(h,g)
b<-cbind(h,g1)
c<-cbind(h,g2)
d<-cbind(h,g3)
e<-cbind(h,g4)
empty_list[[i]]<-a
empty_list1[[i]]<-b
empty_list2[[i]]<-c
empty_list3[[i]]<-d
empty_list4[[i]]<-e
}
# Empirical power of Welch's two-sample t-test at each location shift:
# run t.test on (baseline, shifted) columns of every replication and
# record the fraction of p-values below 0.05.
pvalue<-numeric(1000)
pvalue1<-numeric(1000)
pvalue2<-numeric(1000)
pvalue3<-numeric(1000)
pvalue4<-numeric(1000)
x<-numeric(5)
for (i in 1:1000){
pvalue[i]<-t.test(empty_list[[i]][,1],empty_list[[i]][,2])$p.value
# FIX: the next four lines were garbled ("[,empty_list1[[i]][,2])" —
# unbalanced brackets, column 1 missing); each test must compare
# column 1 (baseline) against column 2 (shifted sample).
pvalue1[i]<-t.test(empty_list1[[i]][,1],empty_list1[[i]][,2])$p.value
pvalue2[i]<-t.test(empty_list2[[i]][,1],empty_list2[[i]][,2])$p.value
pvalue3[i]<-t.test(empty_list3[[i]][,1],empty_list3[[i]][,2])$p.value
pvalue4[i]<-t.test(empty_list4[[i]][,1],empty_list4[[i]][,2])$p.value
}
# Empirical power = share of 1000 replications rejecting at the 5% level.
x[1]<-sum(pvalue<0.05)/1000
x[2]<-sum(pvalue1<0.05)/1000
x[3]<-sum(pvalue2<0.05)/1000
x[4]<-sum(pvalue3<0.05)/1000
x[5]<-sum(pvalue4<0.05)/1000
location<-seq(0,2,by =0.5)
plot(location,x,ylab="Power for t1 distributions",xlab="location difference",type = "l",ylim=c(0,1))
# Flatten every replication into one 20-element column: rows 1-10 hold the
# baseline sample, rows 11-20 the shifted sample — the layout expected by
# all the permutation-test functions below (m[1:10] vs m[11:20]).
combined_data<-matrix(data=NA,nrow = 20,ncol=1000,byrow = F)
for ( i in 1:1000){
# FIX: the original stored only column 2 (10 values, silently recycled to
# fill 20 rows), so both "groups" in every permutation sample were
# identical. c(matrix) flattens column-major: baseline then shifted.
combined_data[,i]<-c(empty_list[[i]])
}
# FIX: combined_data1..4 were created as matrix(data=NA,byrow=F) — a 1x1
# matrix — so the column assignments below failed; dimensions restored.
combined_data1<-matrix(data=NA,nrow = 20,ncol=1000,byrow = F)
for ( i in 1:1000){
combined_data1[,i]<-c(empty_list1[[i]])
}
combined_data2<-matrix(data=NA,nrow = 20,ncol=1000,byrow = F)
for ( i in 1:1000){
combined_data2[,i]<-c(empty_list2[[i]])
}
combined_data3<-matrix(data=NA,nrow = 20,ncol=1000,byrow = F)
for ( i in 1:1000){
combined_data3[,i]<-c(empty_list3[[i]])
}
combined_data4<-matrix(data=NA,nrow = 20,ncol=1000,byrow = F)
for ( i in 1:1000){
combined_data4[,i]<-c(empty_list4[[i]])
}
# Permutation test for a difference in means.
#
# m     : vector of 20 observations (first 10 = group 1, last 10 = group 2).
# nreps : number of random permutations (default preserves old behaviour).
# Returns a two-sided Monte-Carlo p-value for mean(g2) - mean(g1), with the
# add-one correction so the p-value is strictly positive.
Pvalue_approximator<-function(m, nreps = 10000){
  g1 <- m[1:10]
  g2 <- m[11:20]
  Tstatistic <- mean(g2) - mean(g1)
  G3 <- numeric(nreps)
  for (i in seq_len(nreps)) {
    shuffled_data <- sample(m)  # permute the group labels
    G3[i] <- mean(shuffled_data[11:20]) - mean(shuffled_data[1:10])
  }
  # FIX: return the value directly instead of relying on the invisible
  # result of a final assignment to the argument.
  (sum(abs(G3) >= abs(Tstatistic)) + 1) / (nreps + 1)
}
# Empirical power of the permutation mean-difference test at each shift.
p<-numeric(5)
# FIX: apply() requires the MARGIN argument; use 2 to map over columns
# (one column per replication). The original calls raised an error.
pval<-apply(combined_data, 2, Pvalue_approximator)
p[1]<-sum( pval < 0.05)/1000
pval1<-apply(combined_data1, 2, Pvalue_approximator)
p[2]<-sum( pval1 < 0.05)/1000
pval2<-apply(combined_data2, 2, Pvalue_approximator)
p[3]<-sum( pval2 < 0.05)/1000
pval3<-apply(combined_data3, 2, Pvalue_approximator)
p[4]<-sum( pval3 < 0.05)/1000
pval4<-apply(combined_data4, 2, Pvalue_approximator)
p[5]<-sum( pval4 < 0.05)/1000
lines(location,p,col="red",lty=2)
# Permutation test based on a robust difference-of-medians statistic:
# |median(g2) - median(g1)| scaled by twice the pooled median absolute
# deviation from each group's own median.
# m: vector of 20 observations (first 10 = group 1, last 10 = group 2).
# Returns a Monte-Carlo p-value (statistic is non-negative, so one tail).
Diff.med.Pvalue_approximator<-function(m){
g1<-m[1:10]
g2<-m[11:20]
# absolute deviations from each group's median
a<-abs(c(g1-median(c(g1))))
b<-abs(c(g2-median(c(g2))))
# pooled robust scale estimate
ab<-2*median(c(a,b))
ac<-abs(median(c(g2))-median(c(g1)))
Tstatistic =ac/ab
nreps=10000
G3 <- numeric(nreps)
for (i in 1:nreps) {
# recompute the same statistic on label-permuted data
shuffled_data<-sample(c(m))
G1 <- (shuffled_data)[1:10]
G2 <- (shuffled_data)[11:20]
o<-abs(c(G1-median(c(G1))))
v<-abs(c(G2-median(c(G2))))
ov<-2*median(c(o,v))
oc<-abs(median(c(G2))-median(c(G1)))
G3[i]<- oc/ov
}
# add-one correction keeps the Monte-Carlo p-value strictly positive;
# the value of this final assignment is what the function returns
m<-(sum(G3 >= Tstatistic)+1)/(nreps+1)
}
# Empirical power of the difference-of-medians permutation test.
po<-numeric(5)
# FIX: apply() requires MARGIN; 2 = apply over columns (replications).
pval<-apply(combined_data, 2, Diff.med.Pvalue_approximator)
po[1]<-sum( pval < 0.05)/1000
pval1<-apply(combined_data1, 2, Diff.med.Pvalue_approximator)
po[2]<-sum( pval1 < 0.05)/1000
pval2<-apply(combined_data2, 2, Diff.med.Pvalue_approximator)
po[3]<-sum( pval2 < 0.05)/1000
pval3<-apply(combined_data3, 2, Diff.med.Pvalue_approximator)
po[4]<-sum( pval3 < 0.05)/1000
pval4<-apply(combined_data4, 2, Diff.med.Pvalue_approximator)
po[5]<-sum(pval4 < 0.05)/1000
lines(location,po,col="green",lty=1)
# Permutation version of the Wilcoxon rank-sum test.
#
# m     : vector of 20 observations (first 10 = group 1, last 10 = group 2).
# nreps : number of random permutations (default preserves old behaviour).
# Returns a two-sided Monte-Carlo p-value based on the group-2 rank sum.
wilcoxon.Pvalue_approximator<-function(m, nreps = 10000){
  g1 <- m[1:10]
  g2 <- m[11:20]
  # rank sum of group 2 within the pooled sample
  # (local name avoids shadowing stats::rt as the original "rt" did)
  pooled_ranks <- rank(c(g1, g2))
  Tstatistic <- sum(pooled_ranks[11:20])
  # expected rank sum under H0: n2 * (N + 1) / 2
  mu <- 10 * (20 + 1) / 2
  G3 <- numeric(nreps)
  for (i in seq_len(nreps)) {
    shuffled_data <- sample(m)
    shuffled_ranks <- rank(shuffled_data)
    G3[i] <- sum(shuffled_ranks[11:20])
  }
  # FIX: the original returned 2*(sum(abs(G3) >= abs(T))+1)/(nreps+1).
  # Rank sums are always positive, so abs() was a no-op, and the doubling
  # could push the "p-value" above 1. A two-sided permutation p-value
  # measures extremeness as distance from the null expectation mu.
  (sum(abs(G3 - mu) >= abs(Tstatistic - mu)) + 1) / (nreps + 1)
}
# Empirical power of the permutation Wilcoxon rank-sum test.
pw<-numeric(5)
# FIX: apply() requires MARGIN; 2 = apply over columns (replications).
pval<-apply(combined_data, 2, wilcoxon.Pvalue_approximator)
pw[1]<-sum( pval < 0.05)/1000
pval1<-apply(combined_data1, 2, wilcoxon.Pvalue_approximator)
pw[2]<-sum( pval1 < 0.05)/1000
pval2<-apply(combined_data2, 2, wilcoxon.Pvalue_approximator)
pw[3]<-sum( pval2 < 0.05)/1000
pval3<-apply(combined_data3, 2, wilcoxon.Pvalue_approximator)
pw[4]<-sum( pval3 < 0.05)/1000
pval4<-apply(combined_data4, 2, wilcoxon.Pvalue_approximator)
pw[5]<-sum( pval4 < 0.05)/1000
lines(location,pw,col="blue",lty=1)
# Permutation test based on the two-sample Hodges-Lehmann estimator:
# |median of all pairwise differences g2[i]-g1[j]| scaled by the median
# absolute pairwise difference of the median-centred pooled sample.
# m: vector of 20 observations (first 10 = group 1, last 10 = group 2).
# Returns a Monte-Carlo p-value (statistic is non-negative, so one tail).
HLE2.Pvalue_approximator<-function(m){
g1<-m[1:10]
g2<-m[11:20]
# centre each group at its own median before pooling
u<-median(c(g1))
v<-median(c(g2))
x<-c(g1-u)
y<-c(g2-v)
xy<-c(x,y)
# all pairwise differences of the centred pooled sample;
# lower.tri keeps each unordered pair once (diagonal excluded)
a<-outer(xy,xy,"-")
t<-a[lower.tri(a)]
# robust scale: median absolute pairwise difference
ab<- median(c(abs(t)))
# two-sample Hodges-Lehmann shift estimate between the groups
ac<-abs(median(c(outer(g2,g1,"-"))))
Tstatistic = ac/ab
nreps=10000
G3 <- numeric(nreps)
for (i in 1:nreps) {
# recompute the same statistic on label-permuted data
shuffled_data<-sample(c(m))
G1 <- (shuffled_data)[1:10]
G2 <- (shuffled_data)[11:20]
f<-median(c(G1))
h<-median(c(G2))
p<-c(G1-f)
r<-c(G2-h)
pr<-c(p,r)
pu<-outer(pr,pr,"-")
xc<-pu[lower.tri(pu)]
b<- median(c(abs(xc)))
acn<-abs(median(c(outer(G2,G1,"-"))))
G3[i]<- acn/b
}
# add-one correction keeps the Monte-Carlo p-value strictly positive;
# the value of this final assignment is what the function returns
m<-(sum(G3 >= Tstatistic)+1)/(nreps+1)
}
# Empirical power of the two-sample Hodges-Lehmann permutation test.
phl<-numeric(5)
# FIX: apply() requires MARGIN; 2 = apply over columns (replications).
pval<-apply(combined_data, 2, HLE2.Pvalue_approximator)
phl[1]<-sum( pval < 0.05)/1000
pval1<-apply(combined_data1, 2, HLE2.Pvalue_approximator)
phl[2]<-sum( pval1 < 0.05)/1000
pval2<-apply(combined_data2, 2, HLE2.Pvalue_approximator)
phl[3]<-sum( pval2 < 0.05)/1000
pval3<-apply(combined_data3, 2, HLE2.Pvalue_approximator)
phl[4]<-sum( pval3 < 0.05)/1000
pval4<-apply(combined_data4, 2, HLE2.Pvalue_approximator)
phl[5]<-sum( pval4 < 0.05)/1000
lines(location,phl,col="orange",lty=1)
# Permutation test based on one-sample Hodges-Lehmann estimators:
# |HL(g2) - HL(g1)| (each HL = median of within-group pairwise Walsh
# averages) scaled by the median absolute pairwise difference of the
# median-centred pooled sample.
#
# m     : vector of 20 observations (first 10 = group 1, last 10 = group 2).
# nreps : number of random permutations (default preserves old behaviour).
#
# FIX: several lines of the original were garbled by the paste
# (e.g. 'xy<-c(x,"-")', 'outer(g1,"+")', 'pr<-c(p,"-")' with undefined
# 'a'/'pu'); reconstructed from the intact loop body and from the parallel
# HLE2.Pvalue_approximator.
HLE1.Pvalue_approximator<-function(m, nreps = 10000){
  g1 <- m[1:10]
  g2 <- m[11:20]
  # centre each group at its own median, then pool
  u <- median(g1)
  v <- median(g2)
  xy <- c(g1 - u, g2 - v)
  # robust scale: median |xi - xj| over all pairs of the centred pool
  a <- outer(xy, xy, "-")
  ab <- median(abs(a[lower.tri(a)]))
  # one-sample HL estimator of each group's location
  # (lower.tri keeps each unordered pair once, as in HLE2)
  ma <- outer(g2, g2, "+")
  deno1 <- median(ma[lower.tri(ma)] / 2)
  mn <- outer(g1, g1, "+")
  deno2 <- median(mn[lower.tri(mn)] / 2)
  Tstatistic <- abs(deno1 - deno2) / ab
  G3 <- numeric(nreps)
  for (i in seq_len(nreps)) {
    shuffled_data <- sample(m)
    G1 <- shuffled_data[1:10]
    G2 <- shuffled_data[11:20]
    pr <- c(G1 - median(G1), G2 - median(G2))
    pu <- outer(pr, pr, "-")
    b <- median(abs(pu[lower.tri(pu)]))
    mas <- outer(G2, G2, "+")
    dn1 <- median(mas[lower.tri(mas)] / 2)
    mns <- outer(G1, G1, "+")
    dn2 <- median(mns[lower.tri(mns)] / 2)
    G3[i] <- abs(dn2 - dn1) / b
  }
  # statistic is already an absolute value, so a single tail suffices;
  # add-one correction keeps the p-value strictly positive
  (sum(G3 >= Tstatistic) + 1) / (nreps + 1)
}
# Empirical power of the one-sample Hodges-Lehmann permutation test.
pl<-numeric(5)
# FIX: apply() requires MARGIN; 2 = apply over columns (replications).
pval<-apply(combined_data, 2, HLE1.Pvalue_approximator)
pl[1]<-sum( pval < 0.05)/1000
pval1<-apply(combined_data1, 2, HLE1.Pvalue_approximator)
pl[2]<-sum( pval1 < 0.05)/1000
pval2<-apply(combined_data2, 2, HLE1.Pvalue_approximator)
pl[3]<-sum( pval2 < 0.05)/1000
# FIX: copy-paste bug — the last two shifts called
# wilcoxon.Pvalue_approximator instead of HLE1.Pvalue_approximator.
pval3<-apply(combined_data3, 2, HLE1.Pvalue_approximator)
pl[4]<-sum( pval3 < 0.05)/1000
pval4<-apply(combined_data4, 2, HLE1.Pvalue_approximator)
pl[5]<-sum( pval4 < 0.05)/1000
lines(location,pl,col="brown",lty=1)
# Permutation version of Mood's median test: the statistic is the number
# of group-2 observations ranked above the pooled median (rank > 10.5
# with N = 20).
# m: vector of 20 observations (first 10 = group 1, last 10 = group 2).
# Returns a Monte-Carlo p-value for the upper tail.
median_Pvalue_approximator<-function(m){
g1<-m[1:10]
g2<-m[11:20]
# ranks of group 2 within the pooled sample
rt<-rank(c(g1,g2))
rt<-rt[11:20]
# count of group-2 values above the overall median (10.5 = (N+1)/2)
Tstatistic<-sum(rt > 10.5)
nreps=10000
G3 <- numeric(nreps)
for (i in 1:nreps) {
# recompute the count on label-permuted data
shuffled_data<-sample(c(m))
G1 <- (shuffled_data)[1:10]
G2 <- (shuffled_data)[11:20]
ra<-rank(c(G1,G2))
ra<-ra[11:20]
G3[i]<-sum(ra > 10.5)
}
# add-one correction keeps the Monte-Carlo p-value strictly positive;
# the value of this final assignment is what the function returns
m<-(sum(G3 >= Tstatistic)+1)/(nreps+1)
}
# Empirical power of the permutation median test, plus the shared legend.
pm<-numeric(5)
# FIX: apply() requires MARGIN; 2 = apply over columns (replications).
pval<-apply(combined_data, 2, median_Pvalue_approximator)
pm[1]<-sum( pval < 0.05)/1000
pval1<-apply(combined_data1, 2, median_Pvalue_approximator)
pm[2]<-sum( pval1 < 0.05)/1000
pval2<-apply(combined_data2, 2, median_Pvalue_approximator)
pm[3]<-sum( pval2 < 0.05)/1000
pval3<-apply(combined_data3, 2, median_Pvalue_approximator)
pm[4]<-sum( pval3 < 0.05)/1000
pval4<-apply(combined_data4, 2, median_Pvalue_approximator)
pm[5]<-sum( pval4 < 0.05)/1000
lines(location,pm,col="yellow",lty=1)
# FIX: lty had length 3 for 7 entries (silent recycling gave wrong line
# types); only "mean diff" (red) is drawn dashed (lty = 2).
legend("topleft",legend=c("t.test","HLE2","HLE","Diff.med","median","wilcoxon","mean diff"),col=c( "black","orange","brown","green","yellow","blue","red"),lty=c(1,1,1,1,1,1,2),cex=0.8,text.font=4,bg='white')
解决方法
好,我们有t分布,可以写成
T(n) = N(0,1) · √[n / χ²(n)]
其中 N(0,1) 是标准正态分布,χ²(n) 是自由度为 n 的卡方分布(Chi-squared distribution)。这是很标准的结果。
如果要平移该分布,只需加上平移量 μ,于是
T(n) + μ = N(0,1) · √[n / χ²(n)] + μ    (1)
如果我们让非中心参数(NCP)等于 μ,得到非中心 t 分布(Non-central t-distribution),那么需要平移的是上式中的高斯变量:
T(n, NCP=μ) = N(μ,1) · √[n / χ²(n)] = (N(0,1) + μ) · √[n / χ²(n)]
= N(0,1) · √[n / χ²(n)] + μ · √[n / χ²(n)]    (2)
您看到区别了吗?在eq(1)中,我们添加常数。在eq(2)中,我们添加常数乘以一些难看的随机变量。这些分布是不同的,并且会产生不同的结果。小心使用。
标准的 T(n) 关于 0 对称,T(n) + μ 关于 μ 对称;但非中心 t 分布是不对称的——它是对称的 T(n) 加上不对称项 μ · √[n / χ²(n)]。您可以在 Wikipedia 上查看非中心 t 分布的图示。
更新
运行您的代码(是的,花费了相当长的时间,可能超过 12 个小时)后,我得到了下面的图形。
UPDATE II
我现在对 Python 更熟悉了,所以我用 Python 重新编写了部分检验并运行,几乎是即时完成。对于 df=3 的 t 分布,我得到了与论文图形接近的结果,功效值最高约为 0.8。您也可以很快做出 df=1 的图形,结果应当同样接近论文。或者,您可以把 rng.standard_t 替换为 rng.normal(size=N),在较大的平移下得到功效接近 1 的图形。
代码
# Power of the two-sample t-test for t(3) samples under location shifts.
# FIX: re-indented — the original paste had lost Python's (mandatory)
# indentation, so the script could not run at all.
import numpy as np
from scipy import stats
import matplotlib.pyplot as plt

rng = np.random.default_rng(312345)

N = 10                               # sample size
α = 0.05                             # significance level
shift = [0.0, 0.5, 1.0, 1.5, 2.0]    # location shifts to test
power = np.zeros(len(shift))
for k in range(0, len(shift)):
    s = shift[k]  # current shift
    c = 0         # counter: how many times we reject H0
    for _ in range(0, 1000):
        a = rng.standard_t(df=3, size=N)      # baseline sample
        b = rng.standard_t(df=3, size=N) + s  # sample with shift
        # t-test from two independent samples, assuming equal variance
        t, p = stats.ttest_ind(a, b, equal_var=True)
        if p <= α:
            c += 1
    power[k] = float(c) / 1000.0

fig = plt.figure()
ax = fig.add_subplot(2, 1, 1)
ax.plot(shift, power, 'r-')
plt.show()
和图形
UPDATE III
这是R代码,非常类似于Python,并且制作了相同的图
# R version of the power simulation (same structure as the Python script).
N <- 10
# FIX: the shift grid lost its middle values ("c(0.,2.0)") — the loop
# below indexes k in 1:5, so all five shifts must be present.
shift <- c(0.0, 0.5, 1.0, 1.5, 2.0)
# FIX: power had length 3 but is filled at five indices.
power <- numeric(5)
av <- 0.05
# Sampling distribution under study; swap in one of the commented
# alternatives to reproduce the other panels.
samples <- function(n) {
  rchisq(n,df=3) #rnorm(n) #rt(n,df=3) #rt(n,df=1)
}
pvalue <- function(a,b) {
  # FIX: the second sample "b" was dropped, silently turning this into a
  # one-sample t-test against mu = 0.
  t.test(a,b,var.equal = TRUE)$p.value
}
for (k in 1:5) {
  s <- shift[k]
  p <- replicate(1000,pvalue(samples(N),samples(N) + s))
  cc <- sum(p <= av)  # number of rejections at the 5% level
  power[k] <- cc/1000.0
}
# FIX: the y values (power) were missing from the plot call.
plot(shift,power,type="l")
UPDATE IV
不,我无法在R和Python中获得图1(右下角χ 2 (3))的(纸上)t检验图。我得到的是下面的图。
您要找的是 rt() 的 ncp 参数(Non-Centrality Parameter,非中心性参数)。
查看帮助文件,以了解如何设置ncp参数。