Saturday, November 14, 2009

XML to CSV

Often there is a need to load XML data into database. With the popularity of the Web Services and Java technology, you get lots of these type of requests. Ideally XML is not really good for high volume data.

I had used in past a DOM parser to parse and load XML data into database. It was awfully slow. First a DOM parser need huge memory for bigger files, second the parsing itself is painfully slower. Third, if you are doing a record-by-record insert from XML into database through Java, the process is as good as dead for any data length over 10000 records.

So, I thought of converting XML into a CSV and load that CSV into database using SQL*Loader. This way the process is 1000x times faster. I put together a small java code that uses SAX parser and produces a csv.

Here is the syntax to invoke Java parser

.
java GenericSaxParseToCsv "Feed.xml" "COLUMN_1,COLUMN_2,COLUMN_3,COLUMN_4,COLUMN_5,COLUMN_6,COLUMN_7,COLUMN_8,COLUMN_8,COLUMN_9,COLUMN_10" "123456" "2" > Feed.csv

1. Feed.xml is in the input file
2. COLUMN_1,COLUMN_2,COLUMN_3,COLUMN_4,COLUMN_5,COLUMN_6,COLUMN_7,COLUMN_8,COLUMN_8,COLUMN_9,COLUMN_10
   are the columns in the XML, they would appear in the same sequence in the 
   outputted csv as passed here
3. 123456 - is request id for the whole set
4. 2 
   is number of levels in XML, basically this would control when you want to 
   start your new record in CSV. 
   Passing 2 would create new record in CSV when XML parser encounters COLUMN_1 or 
   COLUMN_2 is any sequence.

And here is the Java Parser
.

//
//    Generic SAX Parser
//    Source http://appswork.blogspot.com/



import java.io.Writer;
import java.io.OutputStreamWriter;
import java.io.File;
import java.io.IOException;

import org.xml.sax.AttributeList;
import org.xml.sax.HandlerBase;
import org.xml.sax.SAXException;

import javax.xml.parsers.SAXParserFactory;
import javax.xml.parsers.SAXParser;

import java.util.StringTokenizer;

public class GenericSaxParseToCsv extends HandlerBase
{

    protected static boolean showVal  = false;
    protected static int nCurrCounter = 1;
    protected static int nLineCounter = 0;
    protected static int nRequestId   = 0;
    protected static int nLevel       = 1;
    protected static int nColumnCount = 0;
    protected static int nCurrentColPos = -1;
    protected static String vDetLevel;

    static private Writer wOut;

    protected static String vColumnHeaders;
    protected static String vColNameValP[][];
    protected static StringTokenizer vColNamesToken;
    protected static String vInputFile;

    public static void main (String argv [])
    {
      // Use the default (non-validating) parser

      vInputFile           = argv[0];
      vColumnHeaders       = argv[1];
      
      try
      {
        nRequestId         = Integer.parseInt(argv[2]);
      }
      catch(Exception e)
      {
      }

      try
      {
        nLevel             = Integer.parseInt(argv[3]);
      }
      catch(Exception e)
      {
      }

      vColNamesToken = new StringTokenizer(vColumnHeaders,",");
      vColNameValP   = new String [2] [vColNamesToken.countTokens()];

      int i = 0;
      while(vColNamesToken.hasMoreTokens())
      {
        vColNameValP[0][i] = vColNamesToken.nextToken().trim();
        vColNameValP[1][i] = "";
        i = i + 1;
      }

      nColumnCount = i;

      SAXParserFactory factory = SAXParserFactory.newInstance();
      try {
          // Set up output stream
          wOut = new OutputStreamWriter (System.out, "UTF8");

          // Parse the input
          SAXParser saxParser = factory.newSAXParser();
          saxParser.parse( new File(vInputFile), new GenericSaxParseToCSV() );

        } catch (Throwable t) {
            t.printStackTrace ();
        }
        System.exit (0);
    }


    public void startDocument ()
    throws SAXException
    {
        showData("Request_ID,"+"LineCounter,"+"DetLevel,"+vColumnHeaders);
        newLine();
        showData("~"+nRequestId+"~"+","+"~"+"STARTREQUEST"+"~"+","+"~"+vInputFile+"~");
        newLine();
        showData("~"+nRequestId+"~"+","+"~"+nLineCounter+"~"+","+"~"+"L0"+"~"+",");
    }

    public void endDocument ()
    throws SAXException
    {
        try {
             printColValues();
             newLine();
             showData("~"+nRequestId+"~"+","+"~"+"ENDREQUEST"+"~"+","+"~"+vInputFile+"~");
             newLine();
             wOut.flush ();
        } catch (IOException e) {
            throw new SAXException ("I/O error", e);
        }
    }

    public void startElement (String name, AttributeList attrs)
    throws SAXException
    {
        for (int i =0; i nCurrCounter)
         {
            printColValues();
            newLine();
            showData("~"+nRequestId+"~"+","+"~"+nLineCounter+"~"+","+"~"+vDetLevel+"~"+",");
            nCurrCounter = nLineCounter;
         }

        if (showVal && !((System.getProperty("line.separator")).equals(s)) && !("\t".equals(s)) && !("\n".equals(s)))
        {
          int ascVal  = s.charAt(0);
          if (ascVal != 10)
            if (nCurrentColPos > -1 )
            {
              vColNameValP[1][nCurrentColPos] = vColNameValP[1][nCurrentColPos] + s;
            }
        }
    }

    private void showData (String s)
    throws SAXException
    {
        try {
            wOut.write (s);
            wOut.flush ();
        } catch (IOException e) {
            throw new SAXException ("I/O error", e);
        }
    }

    // Start a new line
    private void newLine ()
    throws SAXException
    {
        String lineEnd =  System.getProperty("line.separator");
        try {
            wOut.write (lineEnd);
        } catch (IOException e) {
            throw new SAXException ("I/O error", e);
        }
    }
 } // class
.

This java parser would work real cool for simple XML formats for any level of nested elements. It would expect, though non-repeating unique element tags in the XML. And it would not work with XML that has id attributes inside elements.

Cheers.
.

No comments: