-
Notifications
You must be signed in to change notification settings - Fork 55
/
Copy pathkcolumn.pl
executable file
·506 lines (417 loc) · 20.4 KB
/
kcolumn.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
#!/usr/bin/env perl
use warnings;
use strict;
use Carp;
use Pod::Usage;
use Getopt::Long;
our $VERSION = '$Revision: 35 $';
our $LAST_CHANGED_DATE = '$LastChangedDate: 2008-10-03 08:57:47 -0700 (Fri, 03 Oct 2008) $';
our ($verbose, $help, $man);
our ($inputfile, $operator, @number, %number, $numfile, $reverse, $tolerate, $tab, $heading, $output, $start_split, $end_split, $name_by_header,
$topleft, $wordchar, $stopchar, $beforestring, $dryrun, $filenameout);
GetOptions ('verbose'=>\$verbose, 'help'=>\$help, 'man'=>\$man, 'numfile=s'=>\$numfile, 'reverse'=>\$reverse, 'tolerate'=>\$tolerate, 'tab'=>\$tab,
'heading=i'=>\$heading, 'output=s'=>\$output, 'start_split=i'=>\$start_split, 'end_split=i'=>\$end_split, 'name_by_header'=>\$name_by_header,
'topleft=s'=>\$topleft, 'wordchar=s'=>\$wordchar, 'stopchar=s'=>\$stopchar, 'dryrun'=>\$dryrun, 'filenameout=s'=>\$filenameout,
'beforestring=s'=>\$beforestring) or pod2usage ();
$help and pod2usage (-verbose=>1, -exitval=>1, -output=>\*STDOUT);
$man and pod2usage (-verbose=>2, -exitval=>1, -output=>\*STDOUT);
@ARGV or pod2usage (-verbose=>0, -exitval=>1, -output=>\*STDOUT);
@ARGV >= 2 or pod2usage ("Syntax error");
($inputfile, $operator, @number) = @ARGV;
if (defined $numfile) {
@number and pod2usage ("Error in argument: number already defined in command line, so no need to use --numfile option");
open (NUMBER, $numfile) or confess "Error opening numfile $numfile: $!";
$_ = <NUMBER>;
s/[\r\n]+$//;
s/^\s+|\s+$//g;
@number = split (/\s+/, $_);
close (NUMBER);
} else {
@number or pod2usage ("Error in argument: you must specify at least one column");
}
if (defined $wordchar) {
length ($wordchar) == 1 or pod2usage ("Error in argument: --wordchar <$wordchar> can only be a single character/letter");
$stopchar || $beforestring and pod2usage ("Error in argument: please do not specify --stopchr or --beforestring when --wordchar is specified");
}
if (defined $stopchar) {
$beforestring and pod2usage ("Error in argument: please do not specify --beforestring when --wordchar is specified");
length ($stopchar) == 1 or pod2usage ("Error in argument: --stopchar <$stopchar> can only be a single character/letter");
defined $wordchar and pod2usage ("Error in argument: --stopchar override the --wordchar option");
if ($stopchar eq '.' or $stopchar eq '(' or $stopchar eq ')' or $stopchar eq '[' or $stopchar eq ']' or $stopchar eq '{' or $stopchar eq '}' or $stopchar eq '$' or $stopchar eq '^' or $stopchar eq '|' or $stopchar eq '*' or $stopchar eq '+' or $stopchar eq '?' or $stopchar eq "\\") {
$stopchar = "\\" . $stopchar;
}
}
if (defined $beforestring) {
$beforestring =~ s/\./\\./g;
$beforestring =~ s/\-/\\-/g;
$beforestring =~ s/\?/\\?/g;
$beforestring =~ s/\+/\\+/g;
$beforestring =~ s/\*/\\*/g;
$beforestring =~ s/\$/\\\$/g; #special handling here required
$beforestring =~ s/\^/\\^/g;
$beforestring =~ s/\|/\\|/g;
$beforestring =~ s/\(/\\(/g;
$beforestring =~ s/\)/\\)/g;
$beforestring =~ s/\[/\\[/g;
$beforestring =~ s/\]/\\]/g;
$beforestring =~ s/\{/\\{/g;
$beforestring =~ s/\}/\\}/g;
$verbose and print STDERR "NOTICE: Changing --beforestring argument to $beforestring\n";
}
$operator eq 'f' and $operator = 'fetch';
$operator eq 'a' and $operator = 'append';
$operator eq 's' and $operator = 'split';
$operator eq 'as' and $operator = 'appendstring';
$heading ||= 0;
my @old_number;
for (@number) {
if (/^(\d+)-(\d+)$/) {
push @old_number, ($1 .. $2);
} else {
push @old_number, $_;
}
}
@number = @old_number;
if ($operator eq 'split') {
@number == 1 or pod2usage ("Error in argument: when operator is split, only one number is allowed that specifies the number of columns in each output");
$output or pod2usage ("Error in argument: please specify the --output argument for the split operation");
}
#get the maximum column so that later we can check whether the input line has sufficient number of columns
@old_number = sort {$b <=> $a} @number;
if ($operator ne 'appendstring') {
for (@number) {
m/^\d+$/ or pod2usage ("<number> should be a positive integer or zero, but your specified <$_>");
$number{$_} = 1; #a hash storing all the column numbers
$reverse and $_ eq '0' and pod2usage ("Error in argument: you cannot use column 0 together with the --reverse option");
}
}
if ($inputfile eq 'stdin') {
$operator eq 'append' and pod2usage ("Error in argument: you cannot append to stdin using input from stdin");
*FH = *STDIN;
} else {
open (FH, $inputfile) || confess "Error: cannot read from inputfile $inputfile: $!";
}
my (@fhout, @fileout, @heading); #file handle for a bunch of output files (when operator is 'split')
while (my $nextline = <FH>) {
$nextline =~ s/[\r\n]+$//;
my @content;
if ($tab) {
@content = split (/\t/, $nextline); #split the line by the tab character
} else {
$nextline =~ s/^\s+|\s+$//g;
@content = split (/\s+/, $nextline); #split the line by any blank character, and delete all heading and trailing spaces
}
my $newline = '';
if ($operator eq 'append') {
$newline = <STDIN>;
defined $newline or confess "Error: STDIN has less lines than those in $inputfile";
$newline =~ s/\s*[\r\n]+$//;
}
if ($operator eq 'append' or $operator eq 'fetch') {
if ($reverse) { #only print those unspecified column (reverse column selection)
for my $i (0 .. @content-1) {
$number{$i+1} or $newline .= "\t$content[$i]";
}
} else {
for my $i (0 .. @number-1) {
if ($number[$i]) {
if ($number[$i] > @content) {
$tolerate or confess "Error: too few columns (${\(scalar @content)}) in input line <$nextline> for column $number[$i]";
$newline .= "\t";
} else {
$newline .= "\t" . $content[$number[$i]-1];
}
} else {
$newline .= "\t" . join ("\t", @content);
}
}
}
$newline =~ s/^\t//;
$newline .= "\n";
print $newline;
} elsif ($operator eq 'appendstring') {
print $nextline, "\t", join ("\t", @number), "\n";
} elsif ($operator eq 'split') {
if ($topleft and not @fhout) { #replace the top left cell of the file by this string
$content[0] = $topleft;
}
if ($heading) {
@heading = splice (@content, 0, $heading);
}
if ($end_split) { #delete anything after --end_split
if (@content >= $number[0]*$end_split) {
splice (@content, $number[0]*$end_split);
}
}
if ($start_split) { #delete anything before --start_split
if (@content <= $number[0] * ($start_split-1)) {
confess "Error: --start_split argument is too small, as there are only ${\(scalar @content)} columns in the row";
}
splice (@content, 0, $number[0]*($start_split-1));
}
if (not @fhout) { #reading the first line of the input stream
@content % $number[0] == 0 or print STDERR "WARNING: the input contains ", scalar (@content)+$heading, " columns, by excluding $heading heading columns, the rest cannot be split to $number[0] whole parts. The last file will contain fewer columns!!!\n";
my $num_fh = int (@content / $number[0]);
@content % $number[0] == 0 or $num_fh++; #one additional file will be printed but it will have only partial columns
my (%opened_filename); #to prevent files from having same name (sometimes header line contains same samples ID)
for my $i (1 .. $num_fh) {
#if ($start_split) {
# $i += ($start_split-1);
#}
my ($temp_fh, $temp_output);
if ($name_by_header) { #output file name is based on the corresponding column in header line
my $firstword;
if ($wordchar) {
$content[($i-1)*$number[0]] =~ m/^([\w$wordchar]+)/ or confess "Error: --name_by_header argument cannot be applied, since the header (" . $content[($i-1)*$number[0]] . ") must start with words or --wordchar <$wordchar>\n";
$firstword = $1;
} elsif ($stopchar) {
$content[($i-1)*$number[0]] =~ m/^([^$stopchar]+)/ or confess "Error: --name_by_header argument cannot be applied, since the header (" . $content[($i-1)*$number[0]] . ") must start with letters other than --stopchar <$stopchar>\n";
$firstword = $1;
if ($firstword =~ m/\s/) {
print STDERR "WARNNING: Sample name $firstword contains space: change to underline instead\n";
$firstword =~s/\s/_/g;
}
} elsif ($beforestring) {
$content[($i-1)*$number[0]] =~ m/(.+?)$beforestring/ or confess "Error: --beforestring argument cannot be applied, since the header (" . $content[($i-1)*$number[0]] . ") must contain the string ($beforestring) that you specified\n";
$firstword = $1;
if ($firstword =~ m/\s/) {
print STDERR "WARNNING: Sample name $firstword contains space: change to underline instead\n";
$firstword =~s/\s/_/g;
}
} else {
$content[($i-1)*$number[0]] =~ m/^(\w+)/ or confess "Error: --name_by_header argument cannot be applied, since the header (" . $content[($i-1)*$number[0]] . ") must start with words";
$firstword = $1;
}
if ($opened_filename{$firstword}) {
while (1) {
-f "$output.$firstword" . "_R$opened_filename{$firstword}" or last; #make sure such file does not exist in current directory (otherwise they will be overwritten)
$opened_filename{$firstword}++;
}
$temp_output = "$output.$firstword" . "_R$opened_filename{$firstword}";
print STDERR "WARNING: Multiple samples with same identifier $firstword found: file renamed as $temp_output\n";
} else {
$temp_output = "$output.$firstword";
if (-f $temp_output) {
$opened_filename{$firstword} = 1;
while (1) {
-f "$output.$firstword" . "_R$opened_filename{$firstword}" or last; #make sure such file does not exist in current directory (otherwise they will be overwritten)
$opened_filename{$firstword}++;
}
print STDERR "WARNING: A file named $temp_output already exists: rename the output file to $output.${firstword}_R$opened_filename{$firstword}\n";
$temp_output = "$output.$firstword" . "_R$opened_filename{$firstword}";
}
}
$opened_filename{$firstword}++; #add one to the opened file names
} else {
my $file_index = $i;
$start_split and $file_index += ($start_split-1);
$temp_output = "$output.split$file_index";
}
eval {
open ($temp_fh, ">$temp_output") or die "$!\n";
};
if ($@) {
print STDERR "ERROR: cannot open $temp_output for writting after opening ${\(scalar @fhout)} files: $@";
print STDERR "POSSIBLE SOLUTION: try to use --start_split and --end_split argument to limit the number of files to be processed at once\n";
for my $i (0 .. @fhout-1) {
close ($fhout[$i]); #close all opened file handles
unlink ($fileout[$i]); #delete all opened files
}
exit (100);
}
push @fhout, $temp_fh;
push @fileout, $temp_output;
}
if ($name_by_header) {
print STDERR "NOTICE: splitted content will be written to ${\(scalar @fhout)} files with name starting as $output and ending as the sample name (as specified in first line of input file)\n";
} elsif ($start_split) {
print STDERR "NOTICE: splitted content will be written to files $output.split$start_split to $output.split", $start_split+$num_fh-1, "\n";
} else {
print STDERR "NOTICE: splitted content will be written to files $output.split1 to $output.split$num_fh\n";
}
if ($dryrun) {
for my $i (0 .. @fhout-1) {
close ($fhout[$i]); #close all opened file handles
unlink ($fileout[$i]); #delete all opened files
}
}
if ($filenameout) {
open (FILENAME, ">$filenameout") or confess "Error: cannot write to file name out $filenameout\n";
print FILENAME join ("\n", @fileout), "\n";
close (FILENAME);
}
if ($dryrun) {
print STDERR "NOTICE: dry run of the program finished with ${\(scalar @fileout)} output files\n";
exit (200);
}
}
for my $i (0 .. @fhout-1) {
my $temp_fh = $fhout[$i];
$heading and print $temp_fh join ("\t", @heading), "\t";
if ($i*$number[0]+$number[0] <= @content) {
print $temp_fh join ("\t", @content[($i*$number[0]) .. ($i*$number[0]+$number[0]-1)]), "\n";
} else {
print $temp_fh join ("\t", @content[($i*$number[0]) .. (@content-1)]), "\n";
}
}
} else {
pod2usage ("Error in argument: operator can only be append or fetch");
}
}
if ($operator eq 'append') {
$_ = <STDIN>;
defined $_ and print STDERR "WARNING: STDIN has more lines than those in $inputfile";
}
=head1 SYNOPSIS
kcolumn.pl <inputfile> <operator> <operated-numbers>
Optional arguments:
-v, --verbose use verbose output
-h, --help print help message
-m, --man print complete documentation
-n, --numfile <string> file containing operated number
-r, --reverse reverse the column selection
--tolerate tolerate lines with too few columns
--tab use tab as field separator (default=space+tab)
--heading <int> number of heading columns
--output <string> output filename for split operation
--start_split <int> start processing file splitting from this file index
--end_split <int> end processing file splitting after this file index
--name_by_header name outputfiles by the first word in header columns
--topleft <string> replace the top left cell by specified string
--wordchar <characters> treat the characters as word in file naming
--stopchar <characters> all characters before this char is used in file naming
--beforestring <string> all characters before this string is used in file naming
--dryrun test program for number of simultaneous files without output
--filenameout <file> write all splitted file names to this file sequentially
Function: perform column extraction operations (fetch or append or split) on
inputfile. All column index start from 1, not zero. Zero means "all columns" in this
program.
Example: kcolumn.pl inputfile3 fetch 1 3
cat inputfile2 | kcolumn.pl inputfile3 append 0
kcolumn.pl inputfile split 3 -head 3 -tab
=head1 OPTIONS
=over 8
=item B<--help>
print a brief help message and exit
=item B<--man>
print the complete manual of how to use the program
=item B<--verbose>
use verbose output
=item B<--numfile>
specify a file containing numbers that is operated by the operator
=item B<--reverse>
reverse the column selection, so only those columns that are not
selected are printed
=item B<--tolerate>
Tolerate input lines that have insufficient columns (normally this
program will throw an error when such line is encountered).
=item B<--heading>
number of heading columns. "heading column" refers to the first a few columns in
each line that will be retained in all splitted output files.
=item B<--start_slit>
specify the starting index of splitting files. When there are too much columns
in the inputfile so that split file is more than 1020, many operating systems
will fail to open new file handles. Specifying the start_split and end_split can
deal with this problem. For example, an inputfile contains 300 columns, with 3
heading columns and 10 columns per splitted file, specifying --start_split as
100 and --end_split as 200 will generate splitted files for the 1004-20003
columns.
=item B<--end_split>
specify the ending index of splitting files.
=item B<--name_by_header>
name the splitted output files by the corresponding column in the header line,
as opposed to name the outputfiles by numbers.
=item B<--topleft>
replace the top left cell of the file by this specified string, since in many
cases an input file does not have a top-left cell (the first word in a file)
=item B<--wordchar>
treat the specified characters/letters as word characters, for use together with
the -- name_by_header argument in output file naming. By default, only
alphabetical characters and underline characters are treated as word characters,
and only the first word in each column is used as the output file name.
=item B<--stopchar>
treat the specified characters as stop characters, for use together with the --
name_by_header argument in output file naming. By default, only alphabetical
characters and underline characters are treated as word characters and are used
in file naming, but once --stopchar is specified, anything other than this
character will be treated as a word, and such word will be used as part of the
output file name.
=back
=head1 DESCRIPTION
This program is used to perform various column operations on a text file. Column
is defined as text strings separated by empty character (either space or tab or
return) or tab character in each line (when -tab argument is set). Currently three
operations are supported: fetch, append and split. In the future, more column
operations will be implemented and added to this program.
The "fetch" operation retrieves specific columns from a file and displays the
content. The "append" operation fetched columns from an inputfile and then
append the content to STDIN stream. The "split" operation split the inputfile
into several output files, each containing certain columns in the input file.
The operated-numbers (column specifiers) can be expressed in two formats: (1) a
list of numbers such as "1 3 7" (2) a range of numbers such as "1 3-5 8-10"
(equivalent to "1 3 4 5 8 9 10"). Alternatively, you can put all numbers in a
single line in a file separated by spaces, and use the --numfile option to
specify the file to the program. ALL NUMBERS MUST BE POSITIVE INTEGER OR ZERO,
AND ZERO REPRESENTS ALL COLUMNS.
=head2 B<fetch operation>
The fetch operation is identical to the "cut" program in many Linux
distributions. However, by default the blank character (such as space and tab)
is used in defining columns, instead of the tab character which is used in the
cut program. In addition, the cut program can only display columns in sorted
order, but kcolumn.pl program can be very flexible to display desired columns in
specified order. For example, the content of a file called "inputfile" is below:
11 10 red
12 22 yellow
13 37 green
14 50 black
15 70 blue
When using the command
kcolumn.pl inputfile f 3 1
the output will be:
red 11
yellow 12
green 13
black 14
blue 15
=head2 B<append operation>
The append operation is used to fetch and append to a data stream received from
STDIN. For example, using the command
perl -e 'print "good\n" for (1..5)' | kcolumn.pl inputfile a 3
will generate the output:
good red
good yellow
good green
good black
good blue
The first column in each line (good) is printed from the first command, while
the second column in each line is retrieved from the third column of the
inputfile and appended to the output line.
=head2 B<split operation>
The split operation is used to split an input file to multiple files, each file
contains certain columns from the input file. For example, suppose the input
file contains 10 columns, and the first column represent the "heading" column
that should be kept in each individual file. Now if we want to split the
inputfile into three separate files, we can use
kcolumn.pl inputfile split 3 -heading 1 -out newfile
The resulting three files (file names are newfile.split1, newfile.split2 and
newfile.split3, respectively) will each contain four columns from the inputfile,
including one heading column.
When the --name_by_header argument is set, the output file names will start with
newfile, but with a prefix that is determined by the header line (first line) of
the inputfile. For the example above, the output file names will be newfile.11,
newfile.10 and newfile.red.
When the inputfile contains many columns to be splitted into >1000 individual
files, usually the operating system will complain that there are too many open
file handles. In this case, one can use the --start_split and --end_split
argument to overcome this limitation and process selected columns. For example,
for an input file that will generate 2500 individual files after splitting, one
can use:
kcolumn.pl inputfile split 3 -start 1 -end 1000 -out newfile
kcolumn.pl inputfile split 3 -start 1001 -end 2000 -out newfile
kcolumn.pl inputfile split 3 -start 2001 -end 3000 -out newfile
The above three commands will generate the 2500 individual files (note that it
is okay to use higher number of --end_split than the actual number of columns,
as the program will figure the correct column automatically).
=head2 B<other column operations>
In the future, more column operations will be added to this program.
=cut