What it does:
runs through a file, sorts, and splits it alphabetically into files with words that are equal or less than 200.
In this example I'm using the dict file, and I use it several times, so excuse the redunancy since I didn't want to include the actual source files.
It works, but it takes a very long time.
Can someone help me get it working faster? Thanks much.
Gzip'd file:
Code:
runs through a file, sorts, and splits it alphabetically into files with words that are equal or less than 200.
In this example I'm using the dict file, and I use it several times, so excuse the redunancy since I didn't want to include the actual source files.
It works, but it takes a very long time.
Can someone help me get it working faster? Thanks much.
Gzip'd file:
Code:
Code:
#!/usr/bin/perl -w
use Text::CSV;
use File::Copy;
use File::stat;
use POSIX qw(strftime);
my $green_dir = "tmp/list/green";
my $file_green = "tmp/all_green.txt";
my $blue_dir = "tmp/list/blue";
my $file_blue = "tmp/all_blue.txt";
my $green_blue_dir = "tmp/list/green_blue";
my $file_green_blue = "tmp/all_green_blue.txt";
my $allelse_dir = "tmp/list/allelse";
my $file_allelse = "tmp/allelse.txt";
my $all_dir = "tmp/list/all";
my $file_all = "tmp/all.txt";
my $max = 200;
$cnt = 0;
$cnt2 = 2;
sub rem_green {
$buf = "rm -f $green_dir/*";
system($buf);
}
sub rem_blue {
$buf = "rm -f $blue_dir/*";
system($buf);
}
sub rem_green_blue {
$buf = "rm -f $green_blue_dir/*";
system($buf);
}
sub rem_allelse {
$buf = "rm -f $allelse_dir/*";
system($buf);
}
sub rem_all {
$buf = "rm -f $all_dir/*";
system($buf);
}
@files = (
'A', 'C', 'B', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L',
'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X',
'Y', 'Z', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9'
);
@fl = (
'A', 'C', 'B', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L',
'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X',
'Y', 'Z', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9'
);
if ( $ARGV[0] eq "-o" ) {
$file_name = $file_blue;
$file_dir = $blue_dir;
rem_blue();
}
if ( $ARGV[0] eq "-p" ) {
$file_name = $file_green;
$file_dir = $green_dir;
rem_green();
}
if ( $ARGV[0] eq "-po" ) {
$file_name = $file_green_blue;
$file_dir = $green_blue_dir;
rem_green_blue();
}
if ( $ARGV[0] eq "-a" ) {
$file_name = $file_all;
$file_dir = $all_dir;
rem_all();
}
if ( $ARGV[0] eq "-ae" ) {
$file_name = $file_allelse;
$file_dir = $allelse_dir;
rem_allelse();
}
$files_cnt = 0;
$fl_cnt = 0;
foreach (@files) {
$file = $_;
$file .= "_1.txt";
chomp($file);
unlink("$file_dir/$file");
open( IN, "<", "$file_name" );
while (<IN>) {
$word = $_;
chomp($word);
foreach (@fl) {
$fl = $_;
chomp($fl);
if ( $word =~ /^[$fl]/ && $file =~ /^[$fl]/ ) {
if ( $cnt == $max ) {
$file =~ s/_.*//;
$file .= "_$cnt2.txt";
$cnt2++;
$cnt = 0;
unlink("$file_dir/$file");
}
print "$file_dir $file\n";
open( OUT, ">>", "$file_dir/$file" ) or die $!;
print OUT "$word\n";
close(OUT);
$cnt++;
$fl_cnt++;
}
$fl_cnt = 0;
}
$files_cnt++;
}
$files_cnt = 0;
$cnt = 0;
$cnt2 = 2;
}
close(IN);