I am trying to run a program that takes two fasta sequencing files and compares them. the program has to compare 38,000 file together to make 1 billion comparisons. when i run the program it does about 12,000 comparisons a minute. this would take 60 days to complete!
i am noticing that the program is only using 5 or 6 percent of the processor. The machine has 4 processors so i have no ideas why it uses so little power. Is there any way to speed up performance?
code:
#print "Input number?\n";
$input = $ARGV [1];
#print "output number?\n";
$output = $ARGV [2];
#print "comparison number?\n";
$file = $ARGV [3];
chomp $file;
# print "$match, $mismatch, $gap, $ext \n";
if ($helic =~ /y/) {
open (FILE, "helicoselist.txt") or die "Couldn't open location file: $!\n";
@filename = <FILE>;
$scalefile = @filename;
close (FILE);
$helifile=">/home/wjones/Shared/usbdrive/sequence/Sorted/helicose/helisheet" . $file . ".txt";
#print "helifile = $helifile\n";
open MYFILE, $helifile;
print MYFILE "Database\t","Query\t","Best Score\t","Percent\t", "Fraction\t","Line of Query Match\t", "Line of Subject Match\t","\n";
close (MYFILE);
}
else {
open (FILE, "refseqlist.txt") or die "Couldn't open location file: $!\n";
@filename = <FILE>;
$scalefile = @filename;
close (FILE);
$seqfile=">/home/wjones/Shared/usbdrive/sequence/Sorted/refseq/refsheet" . $file . ".txt";
open MYFILE, $seqfile;
print MYFILE "Database\t","Query\t","Best Score\t","Percent\t", "Fraction\t","Line of Query Match\t", "Line of Subject Match\t","\n";
close (MYFILE);
}
LINE :for ($i=$input; $i<=$output; $i++) {
for ($j=0; $j<$i; $j++) {
$database = $filename [$j];
chomp $database;
$query = $filename [$i];
chomp $query;
#print "database=$database , query=$query \n";
#`/home/wjones/Shared/usbdrive/sequence/blast-2.2.18/bin/bl2seq -i d001/NM_001083111.1 -j d001/NM_001007214.1 -p blastn -r 1 -q -1 -G 20 -E 2 > compared.txt`;
`/home/wjones/Shared/usbdrive/sequence/blast-2.2.18/bin/bl2seq -i "$query" -j "$database" -p blastn -r 1 -q -2 -G 20 -E 2 > compared"$file".txt`;
$compare = "compared" . $file . ".txt";
#print $compare, "\n";
open (COMP, $compare) or die "Couldn't open compared file\n";
$bestscore = 0;
while (<COMP>) { # sets each line to $_ for each iteration
$newscore = 0;
chomp $_;
#print "Text line: $_ \n";
if ($_ =~ / Score/) {
($var1, $score, $var3) = split /[()]/, $_; #use square brackets to parse on parentheses
#print "Var1 is: $var1 \n";
#print "The score is: $var2 \n";
#print "Var3 is: $var3\n";
$line2 = readline *COMP; #Identities
$line3 = readline *COMP; #Strand
$line4 = readline *COMP;
$line5 = readline *COMP;
$line6 = readline *COMP; #query line
$line7 = readline *COMP;
$line8 = readline *COMP; #subject line
#print "$line2";
#print "$line3";
if ($line3 =~ /Strand = Plus \/ Plus/) {
$newscore = $score;
#print "$newscore\n";
chomp $line2;
my ($dat1, $dat2, $dat3, $dat4, $dat5, $dat6) = split /[ ]/, $line2;
#print "$dat3,$dat4,$dat5\n";
($fraction, $excess1 ) = split /\//, $dat4;
#print "$fraction\n";
my ($excess2, $perctemp, $excess3) = split /[()]/, $dat5;
#print "$perctemp\n"; #percent with sign
($percent, $excess4) = split /%/, $perctemp;
#print "$percent\n";
($qu1, $queryln, $qu2, $qu3) = split / /, $line6;
#print "$queryln\n";
($qu1, $sbjctln, $qu2, $qu3) = split / /, $line8;
#print "$sbjctln\n";
}
if ($newscore > $bestscore) {
$bestscore = $newscore;
$bestfrac = $fraction;
$bestperc = $percent;
#print "$percbest, $percent";
}
}
#print "The score still is: $bestscore \n";
}
# print "The new score is: $bestcore, \n";
#print " $database\n $bestscore\n $query\n";
#print " percent: $bestperc Fraction: $bestfrac\n";
if ($helic =~ /y/) {
$helifile2 = ">" . $helifile;
open MYFILE, $helifile2;
if ($bestscore == 0) {
#print "Not valid!!\n";
print MYFILE "$database\t","$query\t","0\t","0\t", "0\t",".\t", ".\t","\n";
print MYFILE "$query\t","$database\t","0\t","0\t", "0\t",".\t", ".\t","\n";
close (MYFILE);
}
else {
print MYFILE "$database\t","$query\t","$bestscore\t","$bestperc\t", "$bestfrac\t","$queryln\t", "$sbjctln\t","\n";
print MYFILE "$query\t","$database\t","$bestscore\t","$bestperc\t", "$bestfrac\t","$queryln\t", "$sbjctln\t","\n"; # relationship is communative
close (MYFILE);
}
}
if ($helic =~ /n/) {
$seqfile2 = ">" . $seqfile;
open MYFILE, $seqfile2;
if ($bestscore == 0) {
#print "Not valid!!\n";
print MYFILE "$database\t","$query\t","0\t","0\t", "0\t",".\t", ".\t","\n";
print MYFILE "$query\t","$database\t","0\t","0\t", "0\t",".\t", ".\t","\n";
close (MYFILE);
}
else {
print MYFILE "$database\t","$query\t","$bestscore\t","$bestperc\t", "$bestfrac\t","$queryln\t", "$sbjctln\t","\n";
print MYFILE "$query\t","$database\t","$bestscore\t","$bestperc\t", "$bestfrac\t","$queryln\t", "$sbjctln\t","\n"; # relationship is communative
close (MYFILE);
}
}
}
}
print "Comparison Complete!!!!!!\n";
i am noticing that the program is only using 5 or 6 percent of the processor. The machine has 4 processors so i have no ideas why it uses so little power. Is there any way to speed up performance?
code:
#print "Input number?\n";
$input = $ARGV [1];
#print "output number?\n";
$output = $ARGV [2];
#print "comparison number?\n";
$file = $ARGV [3];
chomp $file;
# print "$match, $mismatch, $gap, $ext \n";
if ($helic =~ /y/) {
open (FILE, "helicoselist.txt") or die "Couldn't open location file: $!\n";
@filename = <FILE>;
$scalefile = @filename;
close (FILE);
$helifile=">/home/wjones/Shared/usbdrive/sequence/Sorted/helicose/helisheet" . $file . ".txt";
#print "helifile = $helifile\n";
open MYFILE, $helifile;
print MYFILE "Database\t","Query\t","Best Score\t","Percent\t", "Fraction\t","Line of Query Match\t", "Line of Subject Match\t","\n";
close (MYFILE);
}
else {
open (FILE, "refseqlist.txt") or die "Couldn't open location file: $!\n";
@filename = <FILE>;
$scalefile = @filename;
close (FILE);
$seqfile=">/home/wjones/Shared/usbdrive/sequence/Sorted/refseq/refsheet" . $file . ".txt";
open MYFILE, $seqfile;
print MYFILE "Database\t","Query\t","Best Score\t","Percent\t", "Fraction\t","Line of Query Match\t", "Line of Subject Match\t","\n";
close (MYFILE);
}
LINE :for ($i=$input; $i<=$output; $i++) {
for ($j=0; $j<$i; $j++) {
$database = $filename [$j];
chomp $database;
$query = $filename [$i];
chomp $query;
#print "database=$database , query=$query \n";
#`/home/wjones/Shared/usbdrive/sequence/blast-2.2.18/bin/bl2seq -i d001/NM_001083111.1 -j d001/NM_001007214.1 -p blastn -r 1 -q -1 -G 20 -E 2 > compared.txt`;
`/home/wjones/Shared/usbdrive/sequence/blast-2.2.18/bin/bl2seq -i "$query" -j "$database" -p blastn -r 1 -q -2 -G 20 -E 2 > compared"$file".txt`;
$compare = "compared" . $file . ".txt";
#print $compare, "\n";
open (COMP, $compare) or die "Couldn't open compared file\n";
$bestscore = 0;
while (<COMP>) { # sets each line to $_ for each iteration
$newscore = 0;
chomp $_;
#print "Text line: $_ \n";
if ($_ =~ / Score/) {
($var1, $score, $var3) = split /[()]/, $_; #use square brackets to parse on parentheses
#print "Var1 is: $var1 \n";
#print "The score is: $var2 \n";
#print "Var3 is: $var3\n";
$line2 = readline *COMP; #Identities
$line3 = readline *COMP; #Strand
$line4 = readline *COMP;
$line5 = readline *COMP;
$line6 = readline *COMP; #query line
$line7 = readline *COMP;
$line8 = readline *COMP; #subject line
#print "$line2";
#print "$line3";
if ($line3 =~ /Strand = Plus \/ Plus/) {
$newscore = $score;
#print "$newscore\n";
chomp $line2;
my ($dat1, $dat2, $dat3, $dat4, $dat5, $dat6) = split /[ ]/, $line2;
#print "$dat3,$dat4,$dat5\n";
($fraction, $excess1 ) = split /\//, $dat4;
#print "$fraction\n";
my ($excess2, $perctemp, $excess3) = split /[()]/, $dat5;
#print "$perctemp\n"; #percent with sign
($percent, $excess4) = split /%/, $perctemp;
#print "$percent\n";
($qu1, $queryln, $qu2, $qu3) = split / /, $line6;
#print "$queryln\n";
($qu1, $sbjctln, $qu2, $qu3) = split / /, $line8;
#print "$sbjctln\n";
}
if ($newscore > $bestscore) {
$bestscore = $newscore;
$bestfrac = $fraction;
$bestperc = $percent;
#print "$percbest, $percent";
}
}
#print "The score still is: $bestscore \n";
}
# print "The new score is: $bestcore, \n";
#print " $database\n $bestscore\n $query\n";
#print " percent: $bestperc Fraction: $bestfrac\n";
if ($helic =~ /y/) {
$helifile2 = ">" . $helifile;
open MYFILE, $helifile2;
if ($bestscore == 0) {
#print "Not valid!!\n";
print MYFILE "$database\t","$query\t","0\t","0\t", "0\t",".\t", ".\t","\n";
print MYFILE "$query\t","$database\t","0\t","0\t", "0\t",".\t", ".\t","\n";
close (MYFILE);
}
else {
print MYFILE "$database\t","$query\t","$bestscore\t","$bestperc\t", "$bestfrac\t","$queryln\t", "$sbjctln\t","\n";
print MYFILE "$query\t","$database\t","$bestscore\t","$bestperc\t", "$bestfrac\t","$queryln\t", "$sbjctln\t","\n"; # relationship is communative
close (MYFILE);
}
}
if ($helic =~ /n/) {
$seqfile2 = ">" . $seqfile;
open MYFILE, $seqfile2;
if ($bestscore == 0) {
#print "Not valid!!\n";
print MYFILE "$database\t","$query\t","0\t","0\t", "0\t",".\t", ".\t","\n";
print MYFILE "$query\t","$database\t","0\t","0\t", "0\t",".\t", ".\t","\n";
close (MYFILE);
}
else {
print MYFILE "$database\t","$query\t","$bestscore\t","$bestperc\t", "$bestfrac\t","$queryln\t", "$sbjctln\t","\n";
print MYFILE "$query\t","$database\t","$bestscore\t","$bestperc\t", "$bestfrac\t","$queryln\t", "$sbjctln\t","\n"; # relationship is communative
close (MYFILE);
}
}
}
}
print "Comparison Complete!!!!!!\n";