User Tools

Site Tools


perl:real_csv_parsing
sub parse_csv {
	if (!ref $_[0]) { # if given a file, open it and recurse on the handle
		my ($filename) = @_;
		open my $fp, $filename or die "$filename: $!\n";
		my $r = parse_csv($fp);
		close $fp;
		return $r;
	}
 
	my ($fp) = @_;
	my $char;
	my $status;
	# yes, we're using a single-character FSM to parse CSV in perl
	# yes, I know about split and regexes, but that doesn't help with matched quotes & embedded commas
	# yes, I know about Text::CSV, but that doesn't support embedded newlines
	# a character-based FSM is simple to implement for CSV, the same can not be said for other techniques
	my @table;
	my ($row,$col) = (0,0);
	my $state = 'NORMAL';
	my @fsm = ( # a finite state machine table
		# state      char     newstate     operation
		'NORMAL',    ","  ,  undef,      sub { $col++ },
		'NORMAL',    "\n" ,  undef,      sub { $col=0; $row++ },
		'NORMAL',    "\"" ,  'INQUOTE',   sub { },
		'NORMAL',    '',     undef,      sub { $table[$row][$col] .= $char; },
		'INQUOTE',   "\"" ,  'ENDQUOTE?', sub { },
		'INQUOTE',   '',     undef,       sub { $table[$row][$col] .= $char; },
		'ENDQUOTE?', "\"" ,  'INQUOTE',   sub { $table[$row][$col] .= $char; }, # not end quote, just a double quote
		'ENDQUOTE?', ","  ,  'NORMAL',    sub { $col++ },
		'ENDQUOTE?', "\n" ,  'NORMAL',    sub { $col=0; $row++ },
		'ENDQUOTE?', '',     'NORMAL',    sub { $table[$row][$col] .= $char; }, # we got and end quote then more stuff...weird, but just include the stuff
	);
 
	# convert fsm table to fsm hash for direct lookup
	my %fsm;
	while (@fsm) {
		my ($state,$char,$newstate,$sub) = splice(@fsm,0,4);
		$fsm{$state}{$char} = [$newstate,$sub];
	}
	#print Dumper \%fsm;
	while ($status = read($fp,$char,1)) {
		next if $char eq "\r"; # Windows CRs can just go die
		#print "$state $char: ($fsm{$state}{$char}/$fsm{$state}{''})\n";
		my ($newstate,$sub) = @{ $fsm{$state}{$char} || $fsm{$state}{''} };
		#print "($newstate,$sub) = fsm{$state}{$char};\n";
		if ($newstate) { $state = $newstate; }
		$sub && $sub->();
	}
	if (!defined($status)) { die "$!\n"; }
	return \@table;
}
perl/real_csv_parsing.txt · Last modified: 2009/08/13 09:57 by tkbletsc