Browse code

added code to allow genotype calling for Illumina chips without the need for region specific package

git-svn-id: file:///home/git/hedgehog.fhcrc.org/bioconductor/trunk/madman/Rpacks/crlmm@126739 bc3139a8-67e5-0310-9ffc-ced21a209358

unknown authored on 14/02/2017 03:47:52
Showing3 changed files

... ...
@@ -58,10 +58,9 @@ importFrom(mvtnorm, rmvnorm)
58 58
 
59 59
 importFrom(oligoClasses, celfileDate, chromosome2integer, i2p,
60 60
            initializeBigMatrix, initializeBigVector, integerMatrix,
61
-	   isPackageLoaded,
62
-           ldPath, ocLapply, ocProbesets, ocSamples,
63
-	   parStatus,
64
-           splitIndicesByLength, splitIndicesByNode, AssayDataList)
61
+           isPackageLoaded, ldPath, ocLapply, ocProbesets, ocSamples,
62
+           parStatus, splitIndicesByLength, splitIndicesByNode, 
63
+           AssayDataList, genomeBuild, celfileName)
65 64
 
66 65
 importFrom(preprocessCore, normalize.quantiles, normalize.quantiles.determine.target,
67 66
            normalize.quantiles.use.target, subColSummarizeMedian)
... ...
@@ -85,6 +84,8 @@ importFrom(parallel, makeCluster, detectCores, parRapply, stopCluster, clusterEv
85 84
 
86 85
 importFrom(limma, loessFit)
87 86
 
87
+importFrom(ff, delete)
88
+
88 89
 ##----------------------------------------------------------------------------
89 90
 ## export
90 91
 ##----------------------------------------------------------------------------
... ...
@@ -25,8 +25,7 @@ krlmm <- function(cnSet, cdfName, gender=NULL, trueCalls=NULL, verbose=TRUE) {
25 25
     colnames(scores) = colnames(M)  
26 26
     
27 27
     priormeans = calculatePriorValues(M, numSNP, verbose)
28
-    VGLMparameters = calculateParameters(M, priormeans, numSNP, verbose)
29
-  
28
+      
30 29
 ### retrieve or calculate coefficients
31 30
     krlmmCoefficients = getKrlmmVGLMCoefficients(pkgname, trueCalls, VGLMparameters, verbose, numSample, colnames(M))
32 31
     
... ...
@@ -83,7 +82,7 @@ krlmmnopkg <- function(cnSet, offset, gender=NULL, normalize.method=NULL, annota
83 82
     }
84 83
 
85 84
     message("Leaving out non-variant SNPs")
86
-
85
+    
87 86
     # For SNPs with less than 3 distinct data point, exclude them from downstream analysis
88 87
     uniqueCount = apply(M[,], 1, function(x){length(unique(x))})
89 88
     SNPtoProcessIndex = uniqueCount >= 3
... ...
@@ -113,8 +112,18 @@ krlmmnopkg <- function(cnSet, offset, gender=NULL, normalize.method=NULL, annota
113 112
     }
114 113
     colnames(trueCalls)=colnames(M)
115 114
     rownames(trueCalls)=rownames(M)
116
-
117
-    priormeans = calculatePriorValues(M, numSNP, verbose)
115
+if(numSample<=30){
116
+  priormeans=prepriormeans
117
+}else{
118
+  priormeans = calculatePriorValues(M, numSNP, verbose)
119
+    if((abs(priormeans[1]-prepriormeans[1])+abs(priormeans[2]-prepriormeans[2])+abs(priormeans[3]-prepriormeans[3]))>=3){
120
+priormeans=prepriormeans
121
+}else{
122
+      priormeans=priormeans
123
+  }
124
+ }
125
+ 
126
+    
118 127
     VGLMparameters = calculateParameters(M, priormeans, numSNP, verbose)
119 128
 
120 129
 ### retrieve or calculate coefficients
... ...
@@ -131,23 +140,24 @@ krlmmnopkg <- function(cnSet, offset, gender=NULL, normalize.method=NULL, annota
131 140
 ### assign confidence scores
132 141
     computeCallPr(scores, M, calls, numSNP, numSample, verbose)
133 142
 
134
-    YIndex = seq(1:nrow(cnSet))[chromosome(cnSet)==24 & isSnp(cnSet)]
135
-
143
+   YIndex = seq(1:nrow(cnSet))[chromosome(cnSet)==24 & isSnp(cnSet)]
144
+    XIndex = seq(1:nrow(cnSet))[chromosome(cnSet)==23 & isSnp(cnSet)]
136 145
 ### impute gender if gender information not provided
137 146
     if (is.null(gender)) {
138
-        gender = krlmmImputeGender(cnSet, annotation, YIndex, verbose, offset=offset)
147
+       gender = krlmmImputeGender(cnSet, annotation, YIndex, verbose, offset=offset)
139 148
     }
140 149
 
141
-### double-check ChrY SNP cluster, impute gender if gender information not provided
150
+### double-check ChrY ChrX SNP cluster, impute gender if gender information not provided
142 151
     if (!(is.null(gender))) {
143 152
         verifyChrYSNPs(cnSet, M, calls, gender, annotation, YIndex, priormeans, verbose)
153
+        verifyChrXSNPs(cnSet, M, calls, gender, annotation, XIndex, priormeans, verbose)
144 154
     }
145 155
 
146 156
 #    if (length(YIndex)>0  && !(is.null(gender))) {
147 157
 #      verifyChrYSNPs(cnSet, M, calls, gender, annotation, YIndex, priormeans, verbose)
148 158
 #    }
149 159
 ### add back SNPs excluded before
150
-    AddBackNoVarianceSNPs(cnSet, calls, scores, numSNP, numSample, SNPtoProcessIndex, noVariantSNPIndex)
160
+   AddBackNoVarianceSNPs(cnSet, calls, scores, numSNP, numSample, SNPtoProcessIndex, noVariantSNPIndex)
151 161
 
152 162
     close(calls)
153 163
     close(scores)
... ...
@@ -170,7 +180,7 @@ loessnormalization<- function(cnSet, verbose, offset=16, blockSize = 300000){
170 180
 
171 181
     numSNP <- nrow(A)
172 182
     numSample <- ncol(A)
173
-    library(limma)
183
+   
174 184
 
175 185
     M <- oligoClasses::initializeBigMatrix(name="M", numSNP, numSample, vmode = "double")
176 186
     S <- oligoClasses::initializeBigMatrix(name="S", numSNP, numSample, vmode = "double")
... ...
@@ -927,7 +937,7 @@ krlmmImputeGender <- function(cnSet, annotation, YIndex, verbose, offset=0){
927 937
     S = computeAverageLogIntensity(cnSet, verbose, offset=offset) 
928 938
 
929 939
     # S is calculated and saved in original SNP order. 
930
-    matchy = match(annotation[YIndex, "Name"], rownames(S))
940
+    matchy = match(rownames(annotation)[YIndex], rownames(S))
931 941
     matchy = matchy[!is.na(matchy)]
932 942
     if (length(matchy) <= 10){
933 943
         predictedGender = rep(NA, ncol(S))
... ...
@@ -955,8 +965,8 @@ krlmmImputeGender <- function(cnSet, annotation, YIndex, verbose, offset=0){
955 965
     priorS = apply(allS, 2, FUN="median", na.rm=TRUE)
956 966
 
957 967
     if (abs(priorS[1] - priorS[2]) <= 1.6) {
958
-        message("Separation between clusters too small (samples probabaly all the same gender): skipping gender prediction step");
959
-        predictedGender = rep(NA, ncol(Sy))
968
+        message("Separation between clusters too small (samples probabaly all the same gender): ");
969
+      
960 970
     }
961 971
     
962 972
     meanmatrix = apply(Sy, 2, median)
... ...
@@ -986,7 +996,7 @@ verifyChrYSNPs <- function(cnSet, M, calls, gender, annotation, YIndex, priormea
986 996
     open(callsGt)
987 997
     open(callsPr)
988 998
        
989
-    matchy = match(annotation[YIndex, "Name"], rownames(M))
999
+    matchy = match(rownames(annotation)[YIndex], rownames(M))
990 1000
     matchy = matchy[!is.na(matchy)]
991 1001
    
992 1002
     MChrY = M[matchy,]
... ...
@@ -1031,3 +1041,58 @@ verifyChrYSNPs <- function(cnSet, M, calls, gender, annotation, YIndex, priormea
1031 1041
     
1032 1042
     if (verbose) message("Done verifying SNPs on Chromosome Y")
1033 1043
 }
1044
+
1045
+
1046
+
1047
+
1048
+#######################################
1049
+verifyChrXSNPs <- function(cnSet, M, calls, gender, annotation, XIndex, priormeans, verbose){
1050
+    if (verbose) message("Start verifying SNPs on Chromosome X")
1051
+    callsGt = calls(cnSet)
1052
+    callsPr = snpCallProbability(cnSet)
1053
+    open(callsGt)
1054
+    open(callsPr)
1055
+      
1056
+    matchx = match(rownames(annotation)[XIndex], rownames(M))
1057
+    matchx = matchx[!is.na(matchx)]
1058
+   
1059
+    MChrX= M[matchx,]
1060
+    callsChrX = callsGt[matchx,]
1061
+  
1062
+    male = gender == 1
1063
+    female = gender == 2    
1064
+    
1065
+    checkK = apply(callsChrX[, male], 1, function(x){ length(unique(x[!is.na(x)])) } )
1066
+    
1067
+    for(i in 1:nrow(MChrX)){
1068
+      
1069
+        # Chromosome X SNPs for male, k = 1 or 2 permitted for male samples
1070
+thisChrXSNPorigPosition = match(rownames(callsChrX)[i], rownames(callsGt))
1071
+        # early exit for k = 1 or 2 cases. Only re-assign calls to male samples if we previouly assigned
1072
+        # male samples to 3 clusters by mistake. 
1073
+        if (checkK[i] < 3) next;
1074
+          
1075
+        if (class(try(kmeans(MChrX[i, male], c(priormeans[1], priormeans[3]), nstart=1), TRUE)) != "try-error"){
1076
+           
1077
+            maleSampleCalls = kmeans(MChrX[i, male],c(priormeans[1], priormeans[3]), nstart=45)$cluster
1078
+            callsGt[thisChrXSNPorigPosition, male][maleSampleCalls == 1] = 1
1079
+            callsGt[thisChrXSNPorigPosition, male][maleSampleCalls == 2] = 3
1080
+        } else {
1081
+                    
1082
+            distanceToPrior1 = mean(abs(MChrX[i, male] - priormeans[1]))
1083
+            distanceToPrior3 = mean(abs(MChrX[i, male] - priormeans[3]))                
1084
+            callsGt[thisChrXSNPorigPosition, male] = ifelse(distanceToPrior1 < distanceToPrior3, 1, 3)
1085
+        }
1086
+
1087
+        MMaleSamples = MChrX[i, male]
1088
+        callsMaleSample = callsGt[thisChrXSNPorigPosition, male]
1089
+        scoresMaleSample = .Call("krlmmConfidenceScore", t(as.matrix(MMaleSamples)), t(as.matrix(callsMaleSample)), PACKAGE="crlmm");
1090
+
1091
+       callsPr[thisChrXSNPorigPosition, male] = scoresMaleSample       
1092
+    }
1093
+
1094
+    close(callsGt)
1095
+    close(callsPr)
1096
+    
1097
+    if (verbose) message("Done verifying SNPs on Chromosome X")
1098
+}
... ...
@@ -422,12 +422,12 @@ SEXP krlmmComputeM(SEXP A, SEXP B){
422 422
 
423 423
     INTERNAL VARIABLES
424 424
     -------- ---------
425
-    rowsAB, colsAB: dimensions of A and B, which is number of SNP and number of sample
425
+    num_SNP, num_sample: dimensions of A and B, which is number of SNP and number of sample
426 426
   */
427 427
 
428
-  long rowsAB, colsAB;
429
-  rowsAB = INTEGER(getAttrib(A, R_DimSymbol))[0];
430
-  colsAB = INTEGER(getAttrib(A, R_DimSymbol))[1];
428
+  long num_SNP, num_sample;
429
+  num_SNP = INTEGER(getAttrib(A, R_DimSymbol))[0];
430
+  num_sample = INTEGER(getAttrib(A, R_DimSymbol))[1];
431 431
 
432 432
   A = coerceVector(A, REALSXP);
433 433
   B = coerceVector(B, REALSXP);
... ...
@@ -435,16 +435,16 @@ SEXP krlmmComputeM(SEXP A, SEXP B){
435 435
   long i, j;
436 436
 
437 437
   SEXP Rval;
438
-  PROTECT(Rval = allocMatrix(REALSXP, rowsAB, colsAB));
438
+  PROTECT(Rval = allocMatrix(REALSXP, num_SNP, num_sample));
439 439
 
440 440
   double *ptr2M;
441 441
   long elepos;
442 442
   ptr2M = NUMERIC_POINTER(Rval);
443 443
 
444
-  for (i = 1; i <= rowsAB; i++){
445
-    for (j = 1; j <= colsAB; j++){
444
+  for (i = 1; i <= num_SNP; i++){
445
+    for (j = 1; j <= num_sample; j++){
446 446
       // elepos is an index for A, B and M
447
-      elepos = CMatrixElementPosition(i, j, rowsAB);
447
+      elepos = CMatrixElementPosition(i, j, num_SNP);
448 448
       ptr2M[elepos] = (log2(REAL(A)[elepos])-log2(REAL(B)[elepos]));
449 449
     }
450 450
   }
... ...
@@ -459,34 +459,34 @@ SEXP krlmmComputeS(SEXP A, SEXP B){
459 459
   /*
460 460
     ARGUMENTS
461 461
     ---------
462
-    A: intensity matrix for allele A11
462
+    A: intensity matrix for allele A
463 463
     B: intensity matrix for allele B
464 464
     S: average log-intensity for a particular SNP (outgoing)
465 465
 
466 466
     INTERNAL VARIABLES
467 467
     -------- ---------
468
-    rowsAB, colsAB: dimensions of A and B, which is number of SNP and number of sample
468
+    num_SNP, num_sample: dimensions of A and B, which is number of SNP and number of sample
469 469
   */
470 470
 
471
-  long rowsAB, colsAB;
472
-  rowsAB = INTEGER(getAttrib(A, R_DimSymbol))[0];
473
-  colsAB = INTEGER(getAttrib(A, R_DimSymbol))[1];
471
+  long num_SNP, num_sample;
472
+  num_SNP = INTEGER(getAttrib(A, R_DimSymbol))[0];
473
+  num_sample = INTEGER(getAttrib(A, R_DimSymbol))[1];
474 474
 
475 475
   A = coerceVector(A, REALSXP);
476 476
   B = coerceVector(B, REALSXP); 
477 477
   long i, j;
478 478
 
479 479
   SEXP Rval;
480
-  PROTECT(Rval = allocMatrix(REALSXP, rowsAB, colsAB));
480
+  PROTECT(Rval = allocMatrix(REALSXP, num_SNP, num_sample));
481 481
 
482 482
   double *ptr2S;
483 483
   long elepos;
484 484
   ptr2S = NUMERIC_POINTER(Rval);
485 485
 
486
-  for (i = 1; i <= rowsAB; i++){
487
-    for (j = 1; j <= colsAB; j++){
486
+  for (i = 1; i <= num_SNP; i++){
487
+    for (j = 1; j <= num_sample; j++){
488 488
       // elepos is an index for A, B and S
489
-      elepos = CMatrixElementPosition(i, j, rowsAB);
489
+      elepos = CMatrixElementPosition(i, j, num_SNP);
490 490
       ptr2S[elepos] = (log2(REAL(A)[elepos])+log2(REAL(B)[elepos]))/2;
491 491
     }
492 492
   }
... ...
@@ -495,15 +495,16 @@ SEXP krlmmComputeS(SEXP A, SEXP B){
495 495
   return Rval;
496 496
 }
497 497
 
498
-void calculate_multiple_cluster_scores(int row, double *intensity, double mean_intensity, int *clustering, int num_SNP, int num_sample, int *ptr, double **dist, int *clustercount){
499
-    int p, q, o;
498
+void calculate_multiple_cluster_scores(long row, double *intensity, double mean_intensity, int *clustering, long num_SNP, long num_sample, int *ptr, double **dist, long *clustercount){
499
+    long p, q;
500
+    int o;
500 501
     long vectorPos;
501 502
     double a_dist;
502 503
     for(p=1; p <= num_sample; p++){
503 504
         dist[p][p] = 0;
504 505
     }
505
-    for(p=1; p<num_sample; p++){
506
-        for(q=p+1; q<=num_sample; q++){
506
+    for(p=1; p < num_sample; p++){
507
+        for(q = p+1; q <= num_sample; q++){
507 508
    	    a_dist = fabs(intensity[CMatrixElementPosition(row, p, num_SNP)] - intensity[CMatrixElementPosition(row, q, num_SNP)]);
508 509
             dist[p][q] = a_dist;
509 510
             dist[q][p] = a_dist;
... ...
@@ -521,7 +522,7 @@ void calculate_multiple_cluster_scores(int row, double *intensity, double mean_i
521 522
         sum[2] = 0;
522 523
         sum[3] = 0;
523 524
         for (q=1; q<=num_sample; q++){
524
-	  sum[clustering[CMatrixElementPosition(row, q, num_SNP)]] += dist[p][q];
525
+            sum[clustering[CMatrixElementPosition(row, q, num_SNP)]] += dist[p][q];
525 526
         }
526 527
        
527 528
         within_cluster = sum[clustering[vectorPos]] / (clustercount[clustering[vectorPos]]- 1);
... ...
@@ -546,9 +547,9 @@ void calculate_multiple_cluster_scores(int row, double *intensity, double mean_i
546 547
     }   
547 548
 }
548 549
 
549
-void calculate_unique_cluster_scores(int row, double *intensity, double mean_intensity, double intensity_range, int num_SNP, int num_sample, int *ptr)				   
550
+void calculate_unique_cluster_scores(long row, double *intensity, double mean_intensity, double intensity_range, long num_SNP, long num_sample, int *ptr)				   
550 551
 {
551
-    int p;
552
+    long p;
552 553
     long vectorPos;
553 554
 
554 555
     for(p = 1; p <= num_sample; p++){
... ...
@@ -558,24 +559,24 @@ void calculate_unique_cluster_scores(int row, double *intensity, double mean_int
558 559
 }
559 560
 
560 561
 
561
-double calculate_SNP_mean(int row, double *intensity, int num_SNP, int num_sample)
562
+double calculate_SNP_mean(long row, double *intensity, long num_SNP, long num_sample)
562 563
 {
563 564
   double sum_intensity;
564 565
   sum_intensity = 0;
565
-  int p;
566
+  long p;
566 567
   for(p = 1; p <= num_sample; p++){
567 568
     sum_intensity = sum_intensity + intensity[CMatrixElementPosition(row, p, num_SNP)];
568 569
   }
569 570
   return(sum_intensity / num_sample);
570 571
 }
571 572
 
572
-double calculate_SNP_range(int row, double *intensity, int num_SNP, int num_sample)
573
+double calculate_SNP_range(long row, double *intensity, long num_SNP, long num_sample)
573 574
 {
574 575
   double min_intensity;
575 576
   double max_intensity;
576 577
   max_intensity = intensity[CMatrixElementPosition(row, 1, num_SNP)];
577 578
   min_intensity = intensity[CMatrixElementPosition(row, 1, num_SNP)];
578
-  int p;  
579
+  long p;  
579 580
   for(p = 2; p <= num_sample; p++){
580 581
     if (intensity[CMatrixElementPosition(row, p, num_SNP)] < min_intensity){
581 582
       min_intensity = intensity[CMatrixElementPosition(row, p, num_SNP)];  
... ...
@@ -590,7 +591,7 @@ double calculate_SNP_range(int row, double *intensity, int num_SNP, int num_samp
590 591
 
591 592
 SEXP krlmmConfidenceScore(SEXP M, SEXP clustering)
592 593
 {
593
-    int num_SNP, num_sample;
594
+    long num_SNP, num_sample;
594 595
     num_SNP = INTEGER(getAttrib(M, R_DimSymbol))[0];
595 596
     num_sample = INTEGER(getAttrib(M, R_DimSymbol))[1];
596 597
 
... ...
@@ -598,7 +599,7 @@ SEXP krlmmConfidenceScore(SEXP M, SEXP clustering)
598 599
     ptr2cluster = INTEGER_POINTER(AS_INTEGER(clustering));
599 600
     M = coerceVector(M, REALSXP);
600 601
   
601
-    int i, j;
602
+    long i, j;
602 603
     int cluster;
603 604
     double mean_intensity;
604 605
     double intensity_range;
... ...
@@ -617,34 +618,31 @@ SEXP krlmmConfidenceScore(SEXP M, SEXP clustering)
617 618
       dist[i] = (double *)malloc((num_sample + 1) * sizeof(double)); 
618 619
     }
619 620
 
620
-    int cluster_count[4]; // extra element to cope with zero-base notation
621
-   // int clid[4];
621
+    long cluster_count[4]; // extra element to cope with zero-base notation
622 622
 
623 623
     for (i = 1; i <= num_SNP; i++) {
624 624
         cluster_count[1] = 0;
625 625
         cluster_count[2] = 0;
626 626
         cluster_count[3] = 0;
627 627
         for (j=1; j<= num_sample; j++){
628
-	    cluster = ptr2cluster[CMatrixElementPosition(i, j, num_SNP)];
628
+            cluster = ptr2cluster[CMatrixElementPosition(i, j, num_SNP)];
629 629
             cluster_count[cluster]++;
630 630
         }
631
-	mean_intensity = calculate_SNP_mean(i, REAL(M), num_SNP, num_sample);
632
-	intensity_range = calculate_SNP_range(i, REAL(M), num_SNP, num_sample);
631
+        mean_intensity = calculate_SNP_mean(i, REAL(M), num_SNP, num_sample);
632
+        intensity_range = calculate_SNP_range(i, REAL(M), num_SNP, num_sample);
633 633
 
634 634
         k = 0;
635 635
         for (j=1; j<=3; j++){
636 636
             if (cluster_count[j] > 0){
637 637
                 k++;
638
-            //    clid[k] = j;
639 638
             }
640 639
         }
641 640
 
642
-	if (k==1) {
643
-	  calculate_unique_cluster_scores(i, REAL(M), mean_intensity, intensity_range, num_SNP, num_sample, ptr2score);
644
-	} else {
645
-	  calculate_multiple_cluster_scores(i, REAL(M), mean_intensity, ptr2cluster, num_SNP, num_sample, ptr2score, dist, cluster_count);
646
-	}       
647
-      
641
+        if (k==1) {
642
+            calculate_unique_cluster_scores(i, REAL(M), mean_intensity, intensity_range, num_SNP, num_sample, ptr2score);
643
+        } else {
644
+	        calculate_multiple_cluster_scores(i, REAL(M), mean_intensity, ptr2cluster, num_SNP, num_sample, ptr2score, dist, cluster_count);			
645
+        }       
648 646
     } 
649 647
 
650 648
     UNPROTECT(1);