perl:real_csv_parsing
sub parse_csv { if (!ref $_[0]) { # if given a file, open it and recurse on the handle my ($filename) = @_; open my $fp, $filename or die "$filename: $!\n"; my $r = parse_csv($fp); close $fp; return $r; } my ($fp) = @_; my $char; my $status; # yes, we're using a single-character FSM to parse CSV in perl # yes, I know about split and regexes, but that doesn't help with matched quotes & embedded commas # yes, I know about Text::CSV, but that doesn't support embedded newlines # a character-based FSM is simple to implement for CSV, the same can not be said for other techniques my @table; my ($row,$col) = (0,0); my $state = 'NORMAL'; my @fsm = ( # a finite state machine table # state char newstate operation 'NORMAL', "," , undef, sub { $col++ }, 'NORMAL', "\n" , undef, sub { $col=0; $row++ }, 'NORMAL', "\"" , 'INQUOTE', sub { }, 'NORMAL', '', undef, sub { $table[$row][$col] .= $char; }, 'INQUOTE', "\"" , 'ENDQUOTE?', sub { }, 'INQUOTE', '', undef, sub { $table[$row][$col] .= $char; }, 'ENDQUOTE?', "\"" , 'INQUOTE', sub { $table[$row][$col] .= $char; }, # not end quote, just a double quote 'ENDQUOTE?', "," , 'NORMAL', sub { $col++ }, 'ENDQUOTE?', "\n" , 'NORMAL', sub { $col=0; $row++ }, 'ENDQUOTE?', '', 'NORMAL', sub { $table[$row][$col] .= $char; }, # we got and end quote then more stuff...weird, but just include the stuff ); # convert fsm table to fsm hash for direct lookup my %fsm; while (@fsm) { my ($state,$char,$newstate,$sub) = splice(@fsm,0,4); $fsm{$state}{$char} = [$newstate,$sub]; } #print Dumper \%fsm; while ($status = read($fp,$char,1)) { next if $char eq "\r"; # Windows CRs can just go die #print "$state $char: ($fsm{$state}{$char}/$fsm{$state}{''})\n"; my ($newstate,$sub) = @{ $fsm{$state}{$char} || $fsm{$state}{''} }; #print "($newstate,$sub) = fsm{$state}{$char};\n"; if ($newstate) { $state = $newstate; } $sub && $sub->(); } if (!defined($status)) { die "$!\n"; } return \@table; }
perl/real_csv_parsing.txt · Last modified: 2009/08/13 09:57 by tkbletsc