Up: csv-parser   [Contents][Index] Alain Picard’s CSV Parser Code

;;                      -*- mode: lisp -*-
;; CSV parsing/writing utilities, a la Microsoft Excel.
;; Author: Alain Picard <apicard@optushome.com.au>
;;         (also alain.picard@memetrics.com)
;; Version: 0.2
;; File: $Id: csv-parser.lisp,v 1.2 2003/12/09 03:52:09 ap Exp $
;; License:
;; This code is placed under the Lesser GNU Public License (LGPL)
;; (see http://www.fsf.org/licenses/lgpl.html) as
;; clarified for Lisp by Franz when they released AllegroServe (see
;; http://allegroserve.sourceforge.net/license-allogroserve.txt)
;; What this clarification basically means is that compiling this
;; file and loading it into your lisp image, either at time of
;; delivery or runtime, does not make your program a derivative of
;; PURPOSES.  In particular, clause 5 of the LGPL is NOT invoked
;; by you embedding or loading this code, interpeted or compiled,
;; into your application.
;; Of course, if you make modifications to this file, then the terms
;; of the LGPL hold, and you must redistribute the sources of this
;; file and your modifications with your application.
;; Lastly, if you DO make useful changes to this code, I would
;; appreciate receiving the changes (though this NOT a requirement
;; of the license of this code.)
;; This software is "as is", and has no warranty of any kind.  The
;; author assumes no responsibility for the consequences of any use
;; of this software.
;; Notes
;; -----
;;  Differences from Faré's CSV reader:
;;  * this one can import DOS formatted files into a unix image,
;;    and not have the redundant ^M splattered at the end of the fields.
;;    This is important when processing a file on a Unix server which
;;    is being uploaded via HTTP by some poor schmuck on a windoze box.
;;  * Handles empty trailing fields properly
;;  * comes with test suite, to see the semantics I've chosen to implement
;;  * I've included a couple of high-level goodies, like
;;    do-csv-file and map-csv-file, as well as the ability
;;    to control the range of line-numbers on which to apply
;;    your code.  Pretty simple stuff, but useful.
;;  * You get a CSV-file writer, for free!  :-)
;;  Questions:
;;    Can someone tell me why this is more than 10X faster
;;    under CMULisp 18e than under Lispworks 4.2 ? Sheeesh.

(in-package :common-lisp-user)

(defpackage :csv-parser
  (:use :common-lisp)
  (:export  #:*field-separator*

(in-package :csv-parser)

(defparameter *field-separator* #\,
  "The character used to indicate the end of a field
   in a CSV file.")

(defparameter *quote-character* #\"
  "The character used to protect embedded field separators
   (usually commas) and whitespace within a field.

   To import the *quote-character* itself, you must have
   it printed twice in the input stream.")

(defvar *state* nil
  "Holds a function which knows how to handle chars based
   on what we've seen so far.")

(defvar *spaces-gobbled* nil
  "Keeps track of how many blanks have been skipped.  When emitting
   a field, we can trim extra right spaces, if appropriate, using this.")

(defvar *current-field* nil
  "Holds the field we are currently working on.")

(defvar *fields* nil
  "Holds the fields we have collected/parsed so far.")

(defvar *num-fields*
  "Holds the number of fields we have collected/parsed so far.")

;; Public
(defun read-csv-line (stream)
  "Read one line form a stream containing CSV data.
   Returns two values; a list of strings parsed, and
   the number of parsed values."
  (let ((*state* #'skip-white-space)
        (*spaces-gobbled* 0)
        (*fields* ())
        (*num-fields*  0)
        (*current-field* (make-empty-field)))
    (catch 'end-of-line
       (funcall *state* (read-char stream nil :eof))))
    (values (nreverse *fields*)

;; Public
(defun map-csv-file (file fn &key limit (skip-lines 0))
  "Call FN (up to LIMIT times, if specified) with
   a list containing the fields parsed from the CSV
   file FILE.

   SKIP-LINES, if provided, is the number of lines to skip
   before starting to call FN.

   *FIELD-SEPARATOR* and *QUOTE-CHARACTER* can be bound to
   modify what separates fields and delimits fields."
  (with-open-file (stream file :direction :input)
    (loop repeat skip-lines
          do (read-csv-line stream))
    (if limit
        (loop as line = (read-csv-line stream)
              while line
              repeat limit
              do (funcall fn line))
        (loop as line = (read-csv-line stream)
              while line
              do (funcall fn line)))))

;; Public
(defmacro do-csv-file (((fields num-fields) file &key limit (skip-lines 0))
                       &body body)
  "Repeatedly call BODY on CSV file FILE, binding
   FIELDS and NUM-FIELDS to a list containing the parsed fields,
   and the number of fields.
   Code runs inside a block with tagname NIL, so you
   may call (RETURN).

   *FIELD-SEPARATOR* and *QUOTE-CHARACTER* can be bound to
   modify what separates fields and delimits fields."
  (let ((stream (gensym "STREAM"))
        (count  (gensym "COUNT"))
        (glimit (gensym "LIMIT")))
    `(with-open-file (,stream ,file :direction :input)
      (loop repeat ,skip-lines
            do     (read-csv-line ,stream))
      (loop for ,count upfrom 0
            with ,glimit = ,limit
            (multiple-value-bind (,fields ,num-fields) (read-csv-line ,stream)
              (when (or (null ,fields)
                        (and ,glimit
                             (>= ,count ,glimit)))

;;;; Utilities

(defun change-state (state)
  (setf *state*
        (ecase state
          (:skip         #'skip-white-space)
          (:first-quote  #'got-first-quote)
          (:second-quote #'got-second-quote)
          (:regular      #'regular-field))))

(declaim (inline add-char))
(defun add-char (char)
  (declare  (type base-char char))
  (vector-push-extend char *current-field*))

(defun make-empty-field ()
  (make-array 0
              :fill-pointer 0
              :adjustable   t
              :element-type 'base-char))

(defun remove-last-n-chars (n)
  (setf (fill-pointer *current-field*)
        (- (length *current-field*)

(declaim (inline quote-char-p end-of-line-char-p end-of-field-char-p
(defun quote-char-p (char)
  (char= char *quote-character*))

(defun end-of-line-char-p (char)
  (char= char #\Newline))

(defun end-of-field-char-p (char)
  (char= char *field-separator*))

(defun white-space-char-p (char)
  (or (char= char #\Space)
      (char= char #\Tab)
      (char= char #\Return))) ; For DOS style line termination

;;  States:
;;    * skip-white-space (initial state).
;;    * regular-field    handle things like ` foo bar '
;;    * got-first-quote  handle things like ` " foo X'
;;    * got-second-quote handle things like ` " foo  "X '

(defun skip-white-space (char)
    ((or (eq char :eof)
         (end-of-line-char-p char))
     (when *fields*
       ;; If no fields are present, this was a completeley
       ;; blank line.  Otherwise, collect the last null field.
     (throw 'end-of-line nil))

    ((white-space-char-p char)
     ; skip

    ((end-of-field-char-p char)

    ((quote-char-p char)
     (change-state :first-quote))

     (change-state :regular)
     (add-char char))))

(defun regular-field (char)
    ((or (eq char :eof)
         (end-of-line-char-p char))
     (throw 'end-of-line nil))

    ((end-of-field-char-p char)

    ((white-space-char-p char)
     (add-char char)
     (incf *spaces-gobbled*))

    ((quote-char-p char)
     (error "Got a quote after regular characters; ~
             incorrectly formatted CSV file."))

     (setf *spaces-gobbled* 0)
     (add-char char))))

(defun got-first-quote (char)
    ((eq char :eof)
     (error "Ran out of characters before finishing quoted field."))

    ((quote-char-p char)
     (change-state :second-quote))

    (t ; collect anything else
     (add-char char))))

(defun got-second-quote (char)
    ((or (eq char :eof)
         (end-of-line-char-p char))
     (throw 'end-of-line nil))

    ((quote-char-p char)
     ;; This is the weird embedded "" scenario
     (add-char char)
     (change-state :first-quote))

    ((end-of-field-char-p char)

    ((white-space-char-p char)
     (incf *spaces-gobbled*))

     (error "Got unexpected non-blank char after end of a quoted field"))))

(defun emit-field ()
    ((eq *state* #'skip-white-space)
     (push nil *fields*))

    ((eq *state* #'got-second-quote)
     (push *current-field* *fields*))

    ((eq *state* #'regular-field)
     (remove-last-n-chars *spaces-gobbled*)
     (push *current-field* *fields*))

     (assert nil nil "Bug!")))

  (incf *num-fields*)
  (setf *spaces-gobbled* 0
        *state*          #'skip-white-space
        *current-field*  (make-empty-field)))

;;;; test harness
(trace skip-white-space got-first-quote got-second-quote regular-field
       emit-field add-char change-state remove-last-n-chars)


;;  Call this; if nothing asserts, you win.
;;  (csv-tests)

(defun csv-tests ()

(defun csv-test-blank ()
  (with-input-from-string (s "")
    (assert (eq nil (read-csv-line s)))
    (assert (eq nil (read-csv-line s)))))

(defun csv-test-blanks ()
  (with-input-from-string (s "  ")
    (assert (eq nil (read-csv-line s)))
    (assert (eq nil (read-csv-line s)))))

(defun csv-test-empty ()
  (with-input-from-string (s " ,,   , ")
    (assert (equal (list nil nil nil nil) (read-csv-line s)))
    (assert (eq nil (read-csv-line s)))))

(defun csv-test-simple ()
  (with-input-from-string (s "foo,bar,baz")
    (assert (equal (list "foo" "bar" "baz")
                   (read-csv-line s)))))

(defun csv-test-quoted ()
  (with-input-from-string (s "\"foo\",\"bar\",\"baz\"")
    (assert (equal (list "foo" "bar" "baz")
                   (read-csv-line s)))))

(defun csv-test-space-handling ()
  ;; leading/trailing blanks (but not intra word blanks)
  ;; are stripped unless the whole thing is quoted
  (with-input-from-string (s "\"  foo  bar  \",  foo bar  ")
    (assert (equal (list "  foo  bar  " "foo bar")
                   (read-csv-line s)))))

(defun csv-test-other-delimiters ()
  (let ((*field-separator* #\|)
        (*quote-character* #\'))
    (with-input-from-string (s "'foo'|'bar'|'baz'")
      (assert (equal (list "foo" "bar" "baz")
                     (read-csv-line s))))))

(defun csv-test-double-quotes ()
  (with-input-from-string (s "\"foo \"\" bar\",2,3,\"\", \" \"")
    (assert (equal (list "foo \" bar" "2" "3" "" " ")
                   (read-csv-line s)))))

(defun csv-test-embedded-lines ()
  (with-input-from-string (s "\"foo
 and bar\",\" 2 \",3")
    (assert (equal (list "foo
 and bar" " 2 " "3")
                   (read-csv-line s)))))

(defun csv-test-embedded-commas ()
  (with-input-from-string (s "\"foo , bar\",2,3")
    (assert (equal (list "foo , bar" "2" "3")
                   (read-csv-line s)))))



;;;; Writing utilities

(defun write-csv-line (stream fields)
  "Write one CSV line to STREAM, containing fields.
   FIELDS is any (1d) sequence containing strings, symbols or numbers.

   Try to write it as esthetically pleasing as possible,
   i.e. don't output the *quote-character* unless necessary
   to protect the integrity of the data."

  (etypecase fields
    (cons   (write-csv-line-on-cons stream fields))
    (array  (write-csv-line-on-array stream fields))))

(defun write-csv-line-on-cons (stream fields)
  (loop for rest on fields
        while rest
        (write-csv-field stream (first rest))
        (when (cdr rest)
          (write-char *field-separator* stream)))
  (terpri stream))

(defun write-csv-line-on-array (stream fields)
  (loop for field across fields
        repeat (1- (length fields))
        (write-csv-field stream field)
        (write-char *field-separator* stream)

        (write-csv-field stream (aref fields (1- (length fields))))
        (terpri stream)))

(defun write-csv-field (stream field)
  (etypecase field
    (null t)
    (number (princ field stream))
    (string (write-csv-string-safely stream field))
    (symbol (write-csv-string-safely stream (symbol-name field)))))

(defun special-char-p (char)
  (or (char= char *field-separator*)
      (char= char *quote-character*)
      (white-space-char-p char)))

(defun write-csv-string-safely (stream string)
  (if (find-if #'special-char-p string)
      (write-protected-copy stream string)
      (princ string stream)))

(defun write-protected-copy (stream field)
  (write-char *quote-character* stream)
  (loop for c across field
        (write-char c stream)
        (when (char= c *quote-character*)
          ;; Double it
          (write-char c stream)))
  (write-char *quote-character* stream))


Up: csv-parser   [Contents][Index]