CSV Parsing: The CSV Parser
Take a look at the implementation of the CSV parser in our example application.
We'll cover the following...
The CSV parser
We can now move on to implementing a CSV parser. Here is a possible implementation:
%% @doc Encode and decode CSV data (comma-separated fields, CRLF line
%% endings, double-quote escaping) to and from lists of maps keyed by
%% the header names.
-module(bday_csv).
-export([encode/1, decode/1]).

%% @doc Take a list of maps with the same keys and transform them
%% into a string that is valid CSV, with a header.
%%
%% The header is built from the keys of the first map. Each row is then
%% produced by looking those same keys up in the row's map, so a value
%% always lands under its own header regardless of map iteration order.
%% A map that lacks one of the header keys fails with `{badkey, Key}';
%% keys that are not present in the first map are not emitted.
%% An empty list encodes to the empty string.
-spec encode([map()]) -> string().
encode([]) ->
    "";
encode([First | _] = Maps) ->
    Names = maps:keys(First),
    Header = lists:join(",", [escape(Name) || Name <- Names]),
    Rows = [
        lists:join(",", [escape(maps:get(Name, Map)) || Name <- Names])
     || Map <- Maps
    ],
    %% Flattened because the spec promises a plain string, not an iolist.
    lists:flatten([Header, "\r\n", lists:join("\r\n", Rows)]).

%% @doc Take a string that represents a valid CSV data dump
%% and turn it into a list of maps with the header entries as keys.
%%
%% The empty string decodes to the empty list. Otherwise the header line
%% is read first and every remaining row is zipped with it; a row whose
%% number of fields differs from the header makes `lists:zip/2' fail.
-spec decode(string()) -> list(map()).
decode("") ->
    [];
decode(CSV) ->
    %% decode_header/2 and decode_row/2 both accumulate by prepending, so
    %% Headers and each Row are reversed the same way and still pair up.
    {Headers, Rest} = decode_header(CSV, []),
    Rows = decode_rows(Rest),
    [maps:from_list(lists:zip(Headers, Row)) || Row <- Rows].
Note: Decoding is done by fetching the headers, then fetching all of the rows. A header line is parsed by reading each column name one at a time, and a row is parsed by reading each field one at a time.
First, there’s the public interface with two functions: `encode/1` and `decode/1`.
The functions are fairly straightforward, delegating the more complex operations to private helper functions. Let’s start by looking at those helping with encoding:
%%%%%%%%%%%%%%%
%%% PRIVATE %%%
%%%%%%%%%%%%%%%

%% @private return a possibly escaped (if necessary) field or name.
%% A string that needs escaping is wrapped in double quotes with its inner
%% double quotes doubled; any other string is returned unchanged.
-spec escape(string()) -> string().
escape(Field) ->
    case escapable(Field) of
        true -> "\"" ++ do_escape(Field) ++ "\"";
        false -> Field
    end.

%% @private checks whether a string for a field or name needs escaping,
%% i.e. whether it contains a double quote, a comma, a carriage return or
%% a line feed.
-spec escapable(string()) -> boolean().
escapable(String) ->
    lists:any(fun(Char) -> lists:member(Char, [$", $,, $\r, $\n]) end, String).

%% @private replace escapable characters (only `"') in CSV.
%% The surrounding double-quotes are not added; caller must add them.
-spec do_escape(string()) -> string().
do_escape([]) -> [];
do_escape([$" | Str]) -> [$", $" | do_escape(Str)];
do_escape([Char | Rest]) -> [Char | do_escape(Rest)].
If a string is judged to need escaping (according to `escapable/1`), then the string is wrapped in double quotes (`"`) and all double quotes inside of it are escaped with another double quote. With this, encoding is covered. Next, there are decoding’s private functions:
%% @private Decode the entire header line.
%% Returns `{Names, Rest}' where `Rest' is the input following the header
%% line. Names are accumulated by prepending and never reversed, so `Names'
%% is in REVERSE column order; decode_row/2 reverses rows the same way,
%% which is what lets decode/1 zip the two lists together directly.
-spec decode_header(string(), [string()]) -> {[string()], string()}.
decode_header(String, Acc) ->
    case decode_name(String) of
        {ok, Name, Rest} -> decode_header(Rest, [Name | Acc]);
        {done, Name, Rest} -> {[Name | Acc], Rest}
    end.

%% @private Decode all rows into a list, in the order they appear.
%% Stops once a row leaves no input behind. Note that at least one row is
%% always decoded, even when `String' is empty.
-spec decode_rows(string()) -> [[string()]].
decode_rows(String) ->
    case decode_row(String, []) of
        {Row, ""} -> [Row];
        {Row, Rest} -> [Row | decode_rows(Rest)]
    end.

%% @private Decode an entire row.
%% Returns `{Fields, Rest}' where `Rest' is the input following the row.
%% Fields are accumulated by prepending and never reversed, so `Fields' is
%% in REVERSE column order (matching decode_header/2).
-spec decode_row(string(), [string()]) -> {[string()], string()}.
decode_row(String, Acc) ->
    case decode_field(String) of
        {ok, Field, Rest} -> decode_row(Rest, [Field | Acc]);
        {done, Field, Rest} -> {[Field | Acc], Rest}
    end.

%% @private Decode a name; redirects to decoding quoted or unquoted text.
%% A leading double quote selects the quoted decoder (the quote itself is
%% consumed here). `ok' means more entries follow on the same line; `done'
%% means the line (or the input) ended.
-spec decode_name(string()) -> {ok | done, string(), string()}.
decode_name([$" | Rest]) -> decode_quoted(Rest);
decode_name(String) -> decode_unquoted(String).

%% @private Decode a field; redirects to decoding quoted or unquoted text.
%% Same dispatch and return convention as decode_name/1.
-spec decode_field(string()) -> {ok | done, string(), string()}.
decode_field([$" | Rest]) -> decode_quoted(Rest);
decode_field(String) -> decode_unquoted(String).
Decoding is done by fetching the headers, then fetching all of the rows. A header line is parsed by reading each column name one at a time, and a row is parsed by reading each field one at a time. At the end we can see that both fields and names are actually implemented as quoted or unquoted strings:
%% @private Decode a quoted string.
%% The opening double quote must already have been consumed by the caller.
%% Returns `{ok, Text, Rest}' when the closing quote is followed by a comma
%% (more entries follow on the line) and `{done, Text, Rest}' when it is
%% followed by CRLF or by the end of the input.
-spec decode_quoted(string()) -> {ok | done, string(), string()}.
decode_quoted(String) -> decode_quoted(String, []).

%% @private Decode a quoted string, accumulating characters in reverse.
%% Clause order matters: the terminators (closing quote at end of input,
%% before CRLF, or before a comma) are tried before the doubled-quote
%% escape, and any other character is taken literally. There is no clause
%% for input that ends before a closing quote, so such input fails with
%% `function_clause'.
-spec decode_quoted(string(), [char()]) -> {ok | done, string(), string()}.
decode_quoted([$"], Acc) -> {done, lists:reverse(Acc), ""};
decode_quoted([$", $\r, $\n | Rest], Acc) -> {done, lists:reverse(Acc), Rest};
decode_quoted([$", $, | Rest], Acc) -> {ok, lists:reverse(Acc), Rest};
decode_quoted([$", $" | Rest], Acc) -> decode_quoted(Rest, [$" | Acc]);
decode_quoted([Char | Rest], Acc) -> decode_quoted(Rest, [Char | Acc]).

%% @private Decode an unquoted string.
%% Returns `{ok, Text, Rest}' when a comma ends the entry and
%% `{done, Text, Rest}' when CRLF or the end of the input does.
-spec decode_unquoted(string()) -> {ok | done, string(), string()}.
decode_unquoted(String) -> decode_unquoted(String, []).

%% @private Decode an unquoted string, accumulating characters in reverse.
%% The terminator (comma or CRLF) is consumed and not part of `Rest'.
-spec decode_unquoted(string(), [char()]) -> {ok | done, string(), string()}.
decode_unquoted([], Acc) -> {done, lists:reverse(Acc), ""};
decode_unquoted([$\r, $\n | Rest], Acc) -> {done, lists:reverse(Acc), Rest};
decode_unquoted([$, | Rest], Acc) -> {ok, lists:reverse(Acc), Rest};
decode_unquoted([Char | Rest], Acc) -> decode_unquoted(Rest, [Char | Acc]).
Both functions that read quoted or unquoted strings work similarly, except for those that ...