前言
本文用c语言编写t检验和参数估计的程序,并将基因区和非基因区上突变率作为实例分析。
假设检验
t检验最关键的步骤就是求出t值,下面给我们就将t检验的公式转化为代码。
平均数
公式中要计算平均数,下面给出计算平均数的函数 其中x是一个double型数组
double average(double *x, int len)
{
double sum = 0;
for (int i = 0; i <len;i++)
sum += x[i];
return sum/len;
}
方差
公式中要计算方差,下面给出计算方差的函数
double variance(double *x, int len)
{
double average(double *x, int len);
double avg=average(x,len);
double sum=0;
for (int i = 0; i<len;i++)
sum += pow(x[i] - avg, 2);
return sum/(len-1);
}
计算t值
double t_test(double *x,double *y,int len_x,int len_y)
{
double average(double *x, int len);
double variance(double *x, int len);
double avg_x,avg_y,var_x,var_y;
avg_x=average(x,len_x);
avg_y=average(y,len_y);
var_x=variance(x,len_x);
var_y=variance(y,len_y);
int df=len_x+len_y-2;
double s_e,s1_2,t;
s_e = (var_x*(len_x-1)+var_y*(len_y-1))/(df);
s1_2 = sqrt(s_e/len_x+s_e/len_y);
t = (avg_x-avg_y)/s1_2;
t=abs(t);
cout<<"T-value is "<<t<<endl;
if(t>1.96)
{
cout<<"t-vale > T_0.05 (1.96)"<<endl<<"So there is significant difference"<<endl;
}
return 0;
}
参数估计
我们一样是将公式转化为代码~
代码
double parameter_estimation(double *x,int len_x)
{
double average(double *x, int len);
double variance(double *x, int len);
double avg_x,var_x,sx,l1,l2;
avg_x=average(x,len_x);
var_x=variance(x,len_x);
sx=var_x/sqrt(len_x);
l1=avg_x-2.861*sx;
l2=avg_x+2.861*sx;
cout<<"Interval Estimation :"<<"[ "<<l1<<" , "<<l2<<" ]"<<endl;
cout<<" Point Estimation :"<<avg_x<<" +/- "<<2.861*sx<<endl;
}
实例
对基因片段上的突变率和非基因上的突变率进行t检验,判断它们之间有没有显著性差异
直观可视化
C语言t检验全部代码
#include <iostream>
#include <fstream>
#include <iomanip>
#include <sstream>
#include <string>
#include<cmath>
using namespace std;
int main()
{
int len_x=5921,len_y=5937;
double x[len_x],y[len_y];
ifstream inFile("gene_new.CSV", ios::in);
if (!inFile)
{
cout << "打开文件失败!" << endl;
exit(1);
}
int i = 0;
string line;
string field;
while (getline(inFile, line))
{
string field;
istringstream sin(line);
getline(sin, field, ',');
x[i]=atof(field.c_str());
i++;
}
inFile.close();
ifstream inFile3("nongene_new.CSV", ios::in);
if (!inFile3)
{
cout << "打开文件失败!" << endl;
exit(1);
}
i = 0;
while (getline(inFile3, line))
{
string field;
istringstream sin(line);
getline(sin, field, ',');
y[i]=atof(field.c_str());
i++;
}
inFile.close();
double t_test(double *x,double *y,int len_x,int len_y);
cout<<"T-test of mutation rates on gene fragments and non-gene fragments: "<<endl<<endl;
t_test(x,y,len_x,len_y);
cout<<endl;
double parameter_estimation(double *x,int len_x);
cout<<endl<<"Parameter estimation of the mutation rate on genetic fragment: "<<endl<<endl;
parameter_estimation(x,len_x);
cout<<endl<<"Parameter estimation of the mutation rate on non-genetic fragment: "<<endl<<endl;
parameter_estimation(y,len_y);
cout<<endl<<endl<<endl;
return 0;
}
double average(double *x, int len)
{
double sum = 0;
for (int i = 0; i <len;i++)
sum += x[i];
return sum/len;
}
double variance(double *x, int len)
{
double average(double *x, int len);
double avg=average(x,len);
double sum=0;
for (int i = 0; i<len;i++)
sum += pow(x[i] - avg, 2);
return sum/(len-1);
}
double t_test(double *x,double *y,int len_x,int len_y)
{
double average(double *x, int len);
double variance(double *x, int len);
double avg_x,avg_y,var_x,var_y;
avg_x=average(x,len_x);
avg_y=average(y,len_y);
var_x=variance(x,len_x);
var_y=variance(y,len_y);
int df=len_x+len_y-2;
double s_e,s1_2,t;
s_e = (var_x*(len_x-1)+var_y*(len_y-1))/(df);
s1_2 = sqrt(s_e/len_x+s_e/len_y);
t = (avg_x-avg_y)/s1_2;
t=abs(t);
cout<<"T-value is "<<t<<endl;
if(t>1.96)
{
cout<<"t-vale > T_0.05 (1.96)"<<endl<<"So there is significant difference"<<endl;
}
return 0;
}
double parameter_estimation(double *x,int len_x)
{
double average(double *x, int len);
double variance(double *x, int len);
double avg_x,var_x,sx,l1,l2;
avg_x=average(x,len_x);
var_x=variance(x,len_x);
sx=var_x/sqrt(len_x);
l1=avg_x-2.861*sx;
l2=avg_x+2.861*sx;
cout<<"Interval Estimation :"<<"[ "<<l1<<" , "<<l2<<" ]"<<endl;
cout<<" Point Estimation :"<<avg_x<<" +/- "<<2.861*sx<<endl;
}
结果
下面两个图分别是r语言中自己编写的和内置函数给出的结果,可以看出来t值算的都是一样的。 结论就是:基因区和非基因区上的突变率有显著性差异。
|