# This is version 2 of the similarity computations. Whereas version 1 used the
# Masters et al. equation, version 2 uses a different equation defined by us in
# the manuscript:
#
#   similarity = (T1 + T2 - N) / (T1 + T2) * 100
#
# where T1 and T2 are the sums of all data values of the two cell lines being
# compared and N is the number of data columns in which the two cell lines
# differ by more than 1.
#
# Input : a tab-delimited file chosen interactively. Column 1 holds the cell
#         line names, columns 2 to 33 hold the data for the 16 markers.
# Output: "similarity.csv", written next to the input file, containing the
#         cell-line x cell-line similarity table (blank where not computable).

# Choose the input file
f <- file.choose()

# Read the data: 3 character columns followed by 30 numeric columns
d <- read.delim(f, colClasses = c(rep("character", 3), rep("numeric", 30)))

cell.lines <- d[, 1]  # The first column of d will be cell line names
dat <- d[, 2:33]      # Columns 2 to 33 are the data for the 16 markers

# Convert X and Y in the first two columns of dat to be 0 and 1 respectively.
# %in% never yields NA, so every entry that is not exactly "Y" (including a
# missing value) becomes 0.
for (xy.col in 1:2) {
  dat[, xy.col] <- as.numeric(dat[, xy.col] %in% "Y")
}

# Find which row has MOLT-4 data
# q <- grep(cell.lines, pattern = "MOLT")
# Mark MOLT-4 data as missing
# dat[q, ] <- NA

# Convert dat from data.frame to matrix, so we can do operations across columns
dat <- as.matrix(dat)
num.lines <- length(cell.lines)

# Create an empty array called "similarity" of size num.lines x num.lines to
# hold the similarity values, and give names to its rows and columns
similarity <- array(data = NA, dim = c(num.lines, num.lines))
colnames(similarity) <- cell.lines
rownames(similarity) <- cell.lines

# The per-cell-line totals do not depend on the partner cell line, so compute
# them once instead of inside the pairwise loop. A total is NA when the cell
# line has any missing value.
line.totals <- vapply(
  seq_len(num.lines),
  function(i) sum(dat[i, ]),
  numeric(1)
)

# Compute similarities for each cell line pair.
# seq_len() makes the loops a no-op when the input has no rows.
for (c1 in seq_len(num.lines)) {
  for (c2 in seq_len(num.lines)) {
    T1 <- line.totals[c1]
    T2 <- line.totals[c2]

    # Compute N, but only count instances where the STR difference is
    # greater than 1
    N <- sum(abs(dat[c1, ] - dat[c2, ]) > 1)

    # Only fill the cell if all these values are good; otherwise it stays NA
    if (!is.na(T1) && !is.na(T2) && !is.na(N)) {
      similarity[c1, c2] <- ((T1 + T2 - N) / (T1 + T2)) * 100
    }
  }
}

# Write the similarity table to a CSV file in the same folder as the input
# file. The path is built explicitly so the session's working directory is
# left untouched.
write.csv(similarity, file = file.path(dirname(f), "similarity.csv"), na = "")