#!/usr/bin/perl -w use strict; use warnings; my @conditions; # list of conditions to be satisfied my @ofields; # list of field numbers to output my $fsep = ','; # default output field separator string; see -F option my $invert_conditions; # see -v option my $show_field_count; # see -c option my $ordinal = 0; # see -o option my $rcf; # see -f option my $codeblock_ok; # see -t option my @skipchars; # see -s option my $hdr_prefix; # see -i option my $debug_mode; # see -d option my $strict; # see -S option ## QUESTION: should I check for and process rc files in ~ & . directories?? use constant RC_NAMESPACE => 'CSV::parse_csv::RC'; use constant HEADER_PREFIX => '!'; sub option { my $opt = shift; if ( $opt =~ m/^F(.*)/ ) { # sort of similar to awk's -F $fsep = $1; # this is a kludge, but... # ... try to convert ascii 2-character escapes to value ... $fsep =~ s|\\a|\a|g; $fsep =~ s|\\b|\b|g; $fsep =~ s|\\t|\t|g; $fsep =~ s|\\n|\n|g; $fsep =~ s|\\f|\f|g; $fsep =~ s|\\r|\r|g; $fsep =~ s|\\e|\e|g; } elsif ( $opt eq 'c' ) { # output field count for each matching record $show_field_count = 1; } elsif ( $opt eq 't' ) { # enable command line subroutines $codeblock_ok = 1; } elsif ( $opt eq 'v' ) { # inverse of selector conditions $invert_conditions = 1; } elsif ( $opt eq 'h' || $opt eq '-help' ) { # help, -h or --help exec "pod2text $0"; } elsif ( $opt eq 'o' ) { # ordinal field counts $ordinal = 1; } elsif ( $opt eq 'O' ) { # zero-based field counts (default) $ordinal = 0; } elsif ( $opt =~ m/^i(.*)$/ ) { # header record line prefix $hdr_prefix = $1; } elsif ( $opt =~ m/^s(.)$/ ) { # skip records beginning with this char push @skipchars, $1; } elsif ( $opt =~ m/^f(.*)/ ) { # suck in options and subroutines from file $rcf = $1; open RCF, "<$rcf" or die qq(failed to open rc file "$rcf": $!\n); my $rcf_contents = "package " . RC_NAMESPACE . ";\n"; while ( ) { s/^option /main::option /; $rcf_contents .= $_; } close RCF; # print $rcf_contents, "\nDONE\n"; eval $rcf_contents or die qq(error in rc file "$rcf":\n$@\n); } elsif ( $opt eq 'd' ) { # debug mode $debug_mode = 1; } elsif ( $opt eq 'S' ) { # die if a name doesn't resolve $strict = 1; } else { die qq(unknown option "$opt"\n); } } ## define some useful regex patterns: my $numpat = '-?\d*.?\d+'; # should match 4, -15.3, -.05, ... my $oppat = '[=<>~]'; # matches =, >, <, ~ my $namepat = '^[A-Za-z_][A-Za-z_0-9]*'; # note: "-" not allowed while ( defined $ARGV[0] ) { my $arg = shift @ARGV; if ( my ($opt) = $arg =~ m/^-(.*)/ ) { # handle options option $opt; next; } ## numeric field specifiers: if ( $arg =~ m/^(\d+)(($oppat)($numpat))?$/ ) { my ($field, $op, $value) = ($1, $3, $4); $field-- if $ordinal; if ( defined $op ) { push @conditions, [ $field, $op, $value ]; } else { push @ofields, $field; } } ## named field specifiers: elsif ( $arg =~ m/^($namepat)(($oppat)($numpat))?$/ ) { my ($name, $op, $value) = ($1, $3, $4); ## NOTE: the following "code string" is eval'd in later scope: my $nameref = 'defer("' . lc $name . '")'; if ( defined $op ) { push @conditions, [ \$nameref, $op, $value ]; } else { push @ofields, \$nameref; } } ## numeric field range specifier (output only, no conditional version) elsif ( $arg =~ m/^(\d+)-(\d+)$/ ) { my ($from, $to) = ($1, $2); die "illegal range specifier: $arg" unless $to > $from; foreach my $field ( $from .. $to ) { $field-- if $ordinal; push @ofields, $field; } } ## function specifiers: elsif ( $arg =~ m/^($namepat)\((.*)\)(($oppat)($numpat))?$/ ) { my ($func, $fargs, $op, $value) = ($1, $2, $4, $5); ## NOTE: the following $code var is eval'd in a later scope my $code = RC_NAMESPACE . "::$func("; foreach my $arg ( split /\s*,\s*/, $fargs ) { if ( $arg =~ m/^n(\d+)$/ ) { # numeric field reference $code .= '$rec[' . $1 . '-$ordinal],'; } elsif ( $arg =~ m/^$namepat$/ ) { # named field reference $code .= '$rec[$names{$rkey}->{' . lc $arg . '}],'; } elsif ( $arg =~ m/^$numpat$/ ) { # a number $code .= $arg . ','; } else { die qq(unsupported function argument: "$arg"\n); } } $code .= ')'; if ( defined $op ) { push @conditions, [ \$code, $op, $value ]; } else { push @ofields, \$code; } } ## subroutine specifiers: elsif ( $arg =~ m/^{(.+)}(($oppat)($numpat))?$/ ) { my ($code, $op, $value) = ($1, $3, $4); die "command line code blocks {$code} not allowed\n" unless $codeblock_ok; ## NOTE: the following string substitution is eval'd in later scope: $code =~ s/\bn(\d+)/\$rec[$1-$ordinal]/g; # "n5" to "$rec[5]" $code =~ s/\bn_($namepat)/\$rec[\$names{\$rkey}->{$1}]/g; # "n_name" if ( defined $op ) { push @conditions, [ \$code, $op, $value ]; } else { push @ofields, \$code; } } # ## pattern match conditional specifier (conditional only, no output) # elsif ( $arg =~ m/^(\d+)~([\d.*^+]+)$/ ) { # my ($field, $value) = ($1, $2); # $field-- if $ordinal; # push @conditions, [ $field, '~', $value ]; # } else { die qq(argument "$arg" not understood (try: $0 --help)\n); # this might be changed to allow data file to be specified, for instance } } if ( $debug_mode ) { print "DEBUG conditions:\n"; foreach my $item ( @conditions ) { unless ( ref $item ) { print "\t$item\n"; } elsif ( ref $item eq 'ARRAY' ) { print "\t"; foreach my $element ( @$item ) { if ( ref $element eq 'SCALAR' ) { print "$$element"; } else { print "$element"; } } print "\n"; } elsif ( ref $item eq 'ARRAY' ) { print "\t@$item\n"; } elsif ( ref $item eq 'HASH' ) { print "\t%$item\n"; } elsif ( ref $item eq 'SCALAR' ) { print "\t$$item\n"; } } print "DEBUG output selectors:\n"; foreach my $item ( @ofields ) { unless ( ref $item ) { print "\t$item\n"; } elsif ( ref $item eq 'ARRAY' ) { print "\t@$item\n"; } elsif ( ref $item eq 'HASH' ) { print "\t%$item\n"; } elsif ( ref $item eq 'SCALAR' ) { print "\t$$item\n"; } } exit; } my $skipchars = join '|', @skipchars; my %names; my $rkey = "_"; # default record key into names hash my $rkeyfields = 0; # number of fields to use in record key my @rec; # defined globally to be accessible in the following sub: my $nomatch; # set to reject the current record under consideration sub defer { my $name = shift; warn qq(test $rkey $name"\n) unless exists $names{$rkey}; warn qq(field names not initialized; for \$names\{ $rkey}->\{$name}"\n) unless exists $names{$rkey}; unless ( exists $names{$rkey}->{$name} ) { die qq(field "$name" not found in record "$rkey"\n) if $strict;; $nomatch = 1; return undef; } return $rec[$names{$rkey}->{$name}]; } my %V; # global hash for user variables my $alpha; while ( <> ) { # parse data stream of records of comma-separated values chomp; s/\r$//; # for MSDOS files... next if m/^\s*$/; # allow for blank lines last if m/^\s*$/; ($alpha) = m/^([^\d.-])*/; # catch non-numeric prefix! # if ( $hdr_prefix && (my ($field_names) = m/^$hdr_prefix(.*)/ )) { if ( $hdr_prefix && s/^$hdr_prefix// ) { my @field = split /\s*,\s*/; # prep to process field names ## @field contains names (some might be null) for this record type my $key = '_'; # default value $rkeyfields = 0; foreach my $name ( @field ) { # use first numeric fields as record key last unless $name =~ m/^\d+$/; # NOTE: support only for integers! $rkeyfields++; if ( $rkeyfields == 1 ) { $key = $name; } else { # append additional key fields after comma separator... $key .= ',' . $name; } } ## ASSERT: $key is either "_" or a numeric value $names{$key} = {}; # init empty hash for names my $i = -1; foreach my $name ( @field ) { $i++; next unless defined $name; $name = lc $name; $names{$key}->{$name} = $i; } ## ASSERT: one or more maps from name to index number is defined # next; } next if $skipchars && m/^$skipchars/; # my @rec = split ","; @rec = split ","; my @output; undef $nomatch; unless ( $rkeyfields ) { $rkey = '_'; } else { # set the record key for this particular record... $rkey = $rec[0]; for ( my $i=1; $i<$rkeyfields; $i++ ) { $rkey .= ',' . $rec[$i]; } } ## rkey is either "_" or set to the first field(s) of this record foreach my $cond ( @conditions ) { my ( $field, $op, $value) = @$cond; my $test; if ( ref $field ) { # subroutine conditional $test = eval $$field; die qq(error in "$field": $@\n) if $@; } else { die "condition field number $field not present in data" if $field > $#rec; $test = $rec[$field]; } $nomatch=1 && last unless defined $test; $nomatch=1 && last if $test eq ''; last if $alpha; if ( $op eq '=' ) { $nomatch = $test != $value; } elsif ( $op eq '<' ) { $nomatch = $test >= $value; } elsif ( $op eq '>' ) { $nomatch = $test <= $value; } elsif ( $op eq '~' ) { $nomatch = $test !~ m/$value/; } else { die qq(unknown operator: "$op"\n); } last if $nomatch; } next if ( !$nomatch && $invert_conditions ) || ( $nomatch && !$invert_conditions ); push(@output, scalar @rec . ':') if $show_field_count; foreach my $field ( @ofields ) { if ( ref $field ) { # subroutine, function, or name to resolve... my $result = eval $$field; die qq(error in "$$field": $@\n) if $@; $nomatch = not defined $result; push @output, $result; } elsif ( $field > $#rec ) { # not enough fields in record... # my $fnum = $field + $ordinal; # my $total = @rec; # die "no field number $fnum of $total fields"; $nomatch = 1; } else { # $field is a valid index number into record push @output, $rec[$field]; } last if $nomatch; } next if $nomatch; unless ( @ofields ) { # default: output all fields if none are specified foreach my $field_value ( @rec ) { push @output, $field_value; } } $output[0] = $alpha . $output[0] if $alpha; print join($fsep, @output), "\n" if @output; } =head1 NAME parse-csv -- parse, filter, and select fields from simple CSV files =head1 SYNOPSIS $ parse_csv 2-5 10 0=119 2\>200 < input.dat This command outputs fields 2, 3, 4, 5, and 10 for records meeting the conditions that field 0 has value 119 and field 2 is greater than the value 200. =head1 DESCRIPTION The input stream of records of comma-separated numerical fields is processed in two stages. First, comparison operators select which records are considered for output. Second, output specifiers select which fields are output. =head1 CONDITIONS =over 4 =item fnum=value =item fnumvalue Test for equality or relative numeric magnitude of the specified field value. NOTE! These condition operators will need to be quoted somehow to prevent the shell from seeing them as redirection operators. So, for instance, one could use 5\<1500 or '4<500'. =item fnum~pattern Test for regular expression match (using perl's regexes) with the specified field. =item {code}=value =item {code}>value =item {code}value =item f() that code blocks and function calls involve evaluating a string containing the code for each record, so may be considerably slower than straight field comparisons. =head1 OUTPUT SELECTORS Fields are numbered from 0 by default (see L<-o> option). The following selector forms are supported: =over =item field index number Fields are indexed with integer values, 0-based by default (also see -O option) or 1-based (see -o). E.g., the value 5 would select field number 5. =item field range specifier Two integer values separated by "-" map to all fields including and between those values. E.g., 3-7, outputs fields 3,4,5,6,7. The second value must be greater than the first. =item code block Perl code inside curly braces, {...}, is evaluated for one or more output values. Fields are referenced by prefixing the character "n" to the field index number. Note that code blocks may need to be quoted against expansion by the shell. E.g., {n5*100}. =item function call Predefined functions, e.g., from an rc file, can be called using the syntax foo(...), where arguments are either number or field specifiers with the "n" prefix. Note that function calls will need to be quoted if parentheses are interpreted by the shell. =back =head1 OPTIONS =over 4 =item -Fsep Specify separate character or string. Default is comma, ",". Standard unix escapes (\t, \n, etc.) are laboriously supported. =item -v Invert conditions test -- i.e., match records and then do the opposite. =item -c Show field count (number of fields in matched records) as NUMBER: at the beginning of each output line. =item -o Use ordinal numbers for array elements, i.e., first field is 1. The default is to number from zero. =item -O Use zero-based arrays (default). =item -h Display this help (execs the pod2text utility). =item -fFILE Incorporate rc file "FILE" into the runtime environment, mainly for options and user-defined subroutines. Options may be set using the provided option() subroutine, so that they don't need to be specified on the command line. =item -t Enable command line conditional and output code blocks. Suggests Perl's -T taint-check option, which would be important to use if parse-csv is unleashed in a CGI application, for instance. =item -iSTRING[NUM] Define unique identifier string and optional number of fields to indicate or key a header definition line. The key sequence must occur at the beginning of the line. =item -sC Skip records beginning with character C, e.g., C<-s#> or C<-s;>. B: the C<-i> option takes precedence over C<-s>. =item -d Set debug mode on: parse command line, display resulting structures, and quit. =item -S Choose "strict" option: die if a field name doesn't resolve. =back =head1 SUBROUTINES and FUNCTIONS The left-side of conditional constructs, and all output specifiers, are tokens which refer to one or more fields in the records being processed. These tokens can be replaced by expressions in curly brackets containing perl code, and function calls. Here's an expression which combines year and day-of-year into a single value: {n2+n3/365} This could be used in place of an output or a conditional token. $ parse-csv 2003.5 {n2+n3/365} 11 The following macro would do the same thing, given the definition following: yd2y(2,3) $ parse-csv 2003.5 yd2y(2,3) 11 sub yd2y { $y=shift; $d=shift; $y+$d/365 } or perhaps sub yd2y {$_[0]+$_[1]/365} B that code blocks and function calls on the command line may need to be escaped to avoid shell expansion. This would include protecting any $ or redirection characters, for instance, and in the case of functions, protecting the parentheses against shell interpretation. =head1 IMPLEMENTATION The program has two phases. First the command line is parsed, processing conditions and output specifiers into separate lists. A confusing aspect of this processing is the defining of perl code in strings for later evaluation. In these cases, variables are used that are not yet defined; the actual context is found later, in the next phase of the program. The second phase processes each line of the data stream. For each record, all conditions are evaluated, and failure of any condition rejects the record. Field names might be read and updated from the data stream, if so configured. Each record surviving the conditions is then subject to output according to the output specifiers. Besides direct conditions and output specifiers using numeric index numbers for the fields, support has been added for pre-defined functions, on-the-fly subroutines on the command line, and field references by name. The subroutines might be dangerous, so must be enabled by a switch (C<-t>); the user should consider using Perl's I facility if this feature is used for web queries, etc. I feel that functions and other features are safe, since they must be defined separately in another file, and take only numbers or field references for arguments. Data fields are referenced in a few different ways and contexts, and this may present some confusion. Most basic are integers indexing the fields directly. For subroutines and functions, though, the prefix "n" is applied to the field index number; this is necessary since general numbers are also allowed. Field names also have two different forms, again, bare or prefixed. I names, i.e., just the name itself, apply to command line conditionals and output specifiers, and also to functions, since there is no ambiguity in these cases. Subroutines, however, must support more or less arbitrary perl, so field names are treated like perl constants (use constant ...), but with the prefix "n_" applied. =head1 BUGS Data file is currently limited to STDIN. The comma-separated values file must have strictly numeric values, and must be in the simplest CSV format (i.e., no quoted commas). Executed field number specifiers exceeding the quantity in any record will be a fatal error. However, failed conditionals prevent further checking in a record, so out of range field numbers may not cause an error. (Bug or feature?) The -v condition inversion option works on all conditions; perhaps it would be useful to have finer grained "inverse" conditions, such as fnum!=value and fnum!~pattern... Should maybe use GetOpt or similar to properly handle options. The pattern condition operator will need tweaking to support things like negative numbers, etc.. I haven't used this feature much (or at all), so maybe it's just extraneous fluff and should be removed... =head1 AUTHOR Ken Irving, fnkci@uaf.edu