...

/

CSV Parsing: The CSV Parser

CSV Parsing: The CSV Parser

Take a look at the implementation of the CSV parser in our example application.

The CSV parser

We can now move on to implementing a CSV parser. Here is a possible implementation:

Press + to interact
-module(bday_csv).
-export([encode/1, decode/1]).
%% @doc Take a list of maps with the same keys and transform them
%% into a string that is valid CSV, with a header.
%%
%% The column order is taken once, from the keys of the first map, and
%% every row's values are then looked up by those same keys. Each value
%% therefore always lands under its own header, without relying on
%% `maps:keys/1' and `maps:values/1' of different maps happening to
%% iterate in the same order. A map that lacks one of the first map's
%% keys raises `{badkey, Key}'; keys absent from the first map are not
%% written. The empty list encodes to the empty string.
-spec encode([map()]) -> string().
encode([]) -> "";
encode([First | _] = Maps) ->
    Columns = maps:keys(First),
    Header = lists:join(",", [escape(Name) || Name <- Columns]),
    Rows = [lists:join(",", [escape(maps:get(Col, Map)) || Col <- Columns])
            || Map <- Maps],
    %% Records are separated by CRLF; no trailing line break is emitted.
    lists:flatten([Header, "\r\n", lists:join("\r\n", Rows)]).
%% @doc Turn a CSV data dump into a list of maps, one per row, using the
%% header entries as keys. The empty string decodes to the empty list.
%% Each row is zipped with the header names, so a row whose number of
%% fields differs from the header makes `lists:zip/2' fail.
-spec decode(string()) -> list(map()).
decode([]) -> [];
decode(CSV) ->
    {Names, Body} = decode_header(CSV, []),
    ToMap = fun(Fields) -> maps:from_list(lists:zip(Names, Fields)) end,
    lists:map(ToMap, decode_rows(Body)).

Note: Decoding is done by fetching the headers, then fetching all of the rows. A header line is parsed by reading each column name one at a time, and a row is parsed by reading each field one at a time.

First, there’s the public interface with two functions:

  1. encode/1
  2. decode/1.

The functions are fairly straightforward, delegating the more complex operations to private helper functions. Let’s start by looking at those helping with encoding:

Press + to interact
%%%%%%%%%%%%%%%
%%% PRIVATE %%%
%%%%%%%%%%%%%%%
%% @private Return the field or name ready to be written to CSV: wrapped
%% in double quotes (with inner quotes doubled by do_escape/1) when
%% escapable/1 says it needs it, untouched otherwise.
-spec escape(string()) -> string().
escape(Field) ->
    case escapable(Field) of
        false -> Field;
        true -> [$" | do_escape(Field) ++ [$"]]
    end.
%% @private Tell whether a field or name must be quoted, i.e. whether it
%% contains a double quote, a comma, a carriage return or a line feed.
-spec escapable(string()) -> boolean().
escapable(String) ->
    IsSpecial = fun($") -> true;
                   ($,) -> true;
                   ($\r) -> true;
                   ($\n) -> true;
                   (_) -> false
                end,
    lists:any(IsSpecial, String).
%% @private Double every double-quote character in the string; all other
%% characters are kept as they are. The enclosing quotes are not added
%% here, the caller is responsible for them.
-spec do_escape(string()) -> string().
do_escape(Str) ->
    lists:flatmap(fun($") -> [$", $"];
                     (Char) -> [Char]
                  end, Str).

If a string is judged to need escaping (according to escapable/1), then the string is wrapped in double quotes (") and all double quotes inside of it are escaped with another double quote. With this, encoding is covered. Next, there are decoding’s private functions:

Press + to interact
%% @private Read names with decode_name/1 until it reports `done'.
%% Each name is pushed onto the front of `Acc', so the returned list holds
%% the names in reverse order of reading. The second element is whatever
%% input decode_name/1 left unread after the last name.
-spec decode_header(string(), [string()]) -> {[string()], string()}.
decode_header(String, Acc) ->
    {Status, Name, Remaining} = decode_name(String),
    Names = [Name | Acc],
    case Status of
        ok -> decode_header(Remaining, Names);
        done -> {Names, Remaining}
    end.
%% @private Split the input into rows by calling decode_row/2 repeatedly
%% until no input is left. At least one row is always returned, even for
%% empty input.
-spec decode_rows(string()) -> [[string()]].
decode_rows(String) ->
    {Row, Remaining} = decode_row(String, []),
    case Remaining of
        "" -> [Row];
        _ -> [Row | decode_rows(Remaining)]
    end.
%% @private Read fields with decode_field/1 until it reports `done'.
%% Each field is pushed onto the front of `Acc', so the returned list holds
%% the fields in reverse order of reading. The second element is whatever
%% input decode_field/1 left unread after the last field.
-spec decode_row(string(), [string()]) -> {[string()], string()}.
decode_row(String, Acc) ->
    {Status, Field, Remaining} = decode_field(String),
    Fields = [Field | Acc],
    case Status of
        ok -> decode_row(Remaining, Fields);
        done -> {Fields, Remaining}
    end.
%% @private Decode one header name. Input starting with a double quote is
%% handed (without that quote) to decode_quoted/1; anything else goes to
%% decode_unquoted/1.
-spec decode_name(string()) -> {ok|done, string(), string()}.
decode_name([$" | Quoted]) -> decode_quoted(Quoted);
decode_name(Unquoted) -> decode_unquoted(Unquoted).
%% @private Decode one row field. Input starting with a double quote is
%% handed (without that quote) to decode_quoted/1; anything else goes to
%% decode_unquoted/1.
-spec decode_field(string()) -> {ok|done, string(), string()}.
decode_field([$" | Quoted]) -> decode_quoted(Quoted);
decode_field(Unquoted) -> decode_unquoted(Unquoted).

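Decoding is done by fetching the headers, then fetching all of the rows. A header line is parsed by reading each column name one at a time, and a row is parsed by reading each field one at a time. Note that both the header names and the fields of each row are accumulated in reverse order; because they are reversed in the same way, lists:zip/2 in decode/1 still pairs every name with its own value, and the resulting map does not depend on that order. At the end we can see that both fields and names are actually implemented as quoted or unquoted strings: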

Press + to interact
%% @private Decode a quoted string whose opening quote has already been
%% consumed. Returns `{ok, Text, Rest}' when the closing quote is followed
%% by a comma, and `{done, Text, Rest}' when it is followed by CRLF or by
%% the end of the input.
-spec decode_quoted(string()) -> {ok|done, string(), string()}.
decode_quoted(String) -> decode_quoted(String, []).
%% @private Accumulating worker for decode_quoted/1; `Acc' holds the
%% characters read so far, most recent first.
-spec decode_quoted(string(), [char()]) -> {ok|done, string(), string()}.
decode_quoted([$" | After], Acc) -> decode_after_quote(After, Acc);
decode_quoted([Char | Rest], Acc) -> decode_quoted(Rest, [Char | Acc]).
%% @private Decide what a double quote inside a quoted string means by
%% looking at what comes right after it.
-spec decode_after_quote(string(), [char()]) -> {ok|done, string(), string()}.
decode_after_quote([], Acc) -> {done, lists:reverse(Acc), ""};
decode_after_quote([$\r, $\n | Rest], Acc) -> {done, lists:reverse(Acc), Rest};
decode_after_quote([$, | Rest], Acc) -> {ok, lists:reverse(Acc), Rest};
%% A doubled quote is an escaped quote: keep one and carry on.
decode_after_quote([$" | Rest], Acc) -> decode_quoted(Rest, [$" | Acc]);
%% Any other follower: the quote is kept as a plain character.
decode_after_quote(Other, Acc) -> decode_quoted(Other, [$" | Acc]).
%% @private Decode an unquoted string. Returns `{ok, Text, Rest}' when the
%% text is ended by a comma, and `{done, Text, Rest}' when it is ended by
%% CRLF or by the end of the input.
-spec decode_unquoted(string()) -> {ok|done, string(), string()}.
decode_unquoted(String) -> decode_unquoted(String, []).
%% @private Accumulating worker for decode_unquoted/1; `Acc' holds the
%% characters read so far, most recent first.
-spec decode_unquoted(string(), [char()]) -> {ok|done, string(), string()}.
decode_unquoted(String, Acc) ->
    case String of
        [] ->
            {done, lists:reverse(Acc), ""};
        [$\r, $\n | Remaining] ->
            {done, lists:reverse(Acc), Remaining};
        [$, | Remaining] ->
            {ok, lists:reverse(Acc), Remaining};
        [Char | Remaining] ->
            decode_unquoted(Remaining, [Char | Acc])
    end.

Both functions that read quoted or unquoted strings work similarly, except for those that ...