I was looking for a means of parsing a large XML document while retaining the ability to process particular elements of interest. In this post I will demonstrate a way of parsing an XML document with a SAX XMLReader, using an XMLFilter implementation that watches for element(s) nominated by name.
The class below performs partial unmarshalling on a DOM node representing a fragment of the overall document. When the start of the element is discovered we start capturing into the DOM node, and when the corresponding element end is discovered the DOM node is unmarshalled using JAXB and the registered listeners are called, passing the recognised object to a handler method.
The following parameterised class implements the chunking facility.
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 |
package ie.robb.sax.helpers; import java.util.ArrayList; import java.util.List; import javax.xml.bind.JAXBContext; import javax.xml.bind.JAXBElement; import javax.xml.bind.JAXBException; import javax.xml.bind.Unmarshaller; import javax.xml.parsers.DocumentBuilderFactory; import org.w3c.dom.Attr; import org.w3c.dom.Document; import org.w3c.dom.Node; import org.xml.sax.Attributes; import org.xml.sax.SAXException; import org.xml.sax.helpers.XMLFilterImpl; public class SAXElementToObject<T> extends XMLFilterImpl { public interface Listener<T> { void objectRead(T object); void unmarshalError(String message); } public enum FirstOrAllEnum { FIRST, ALL } private String namespaceUri; private String name; private Document document; private Node current; private JAXBContext ctx; private Unmarshaller unmarshaller; private T currentObject; private Class<T> type; private List<Listener<T>> listeners = new ArrayList<Listener<T>>(); private int depth = 0; private FirstOrAllEnum firstOrAll; private boolean found = false; private boolean useNamespace = false; public SAXElementToObject(String name, Class<T> type) throws Throwable { init(name, "", type, FirstOrAllEnum.ALL); } public SAXElementToObject(String name, FirstOrAllEnum firstOrAll, Class<T> type) throws Throwable { init(name, "", type, firstOrAll); } public SAXElementToObject(String name, Class<T> type, Listener<T> listener) throws Throwable { init(name, "", type, FirstOrAllEnum.ALL); this.listeners.add(listener); } public SAXElementToObject(String name, FirstOrAllEnum firstOrAll, Class<T> type, Listener<T> listener) throws Throwable { init(name, "", type, firstOrAll); this.listeners.add(listener); } public SAXElementToObject(String name, String namespaceUri, Class<T> type) throws Throwable { init(name, namespaceUri, type, FirstOrAllEnum.ALL); } public SAXElementToObject(String name, String namespaceUri, FirstOrAllEnum firstOrAll, Class<T> type) throws Throwable { init(name, namespaceUri, type, firstOrAll); } public 
SAXElementToObject(String name, String namespaceUri, Class<T> type, Listener<T> listener) throws Throwable { init(name, namespaceUri, type, FirstOrAllEnum.ALL); this.listeners.add(listener); } public SAXElementToObject(String name, String namespaceUri, FirstOrAllEnum firstOrAll, Class<T> type, Listener<T> listener) throws Throwable { init(name, namespaceUri, type, firstOrAll); this.listeners.add(listener); } private void init(String name, String namespaceUri, Class<T> type, FirstOrAllEnum firstOrAll) throws Throwable { this.name = name; this.namespaceUri = namespaceUri; this.type = type; this.firstOrAll = firstOrAll; this.ctx = JAXBContext.newInstance(type); this.unmarshaller = ctx.createUnmarshaller(); // create a DOM document to build fragments DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance(); factory.setNamespaceAware(true); this.document = factory.newDocumentBuilder().newDocument(); useNamespace = ! this.namespaceUri.equals(""); } public void addListener(Listener<T> listener) { listeners.add(listener); } @Override public void startElement(String uri, String localName, String qName, Attributes atts) throws SAXException { boolean isRoot = false; if(firstOrAll == FirstOrAllEnum.ALL || firstOrAll == FirstOrAllEnum.FIRST && !found) { if( useNamespace && namespaceUri.equals(uri) && name.equals(qName)) { current = document.createElementNS(uri, qName); isRoot = true; depth = 1; found = true; } else if(!useNamespace && name.equals(qName)) { current = document.createElement(qName); isRoot = true; depth = 1; found = true; } } if(depth > 0) { for(int i = 0 ; i < atts.getLength() ; i ++) { Attr att = document.createAttribute(atts.getQName(i)); att.setValue(atts.getValue(i)); current.appendChild(att); } if(!isRoot) { current = current.appendChild(document.createElement(qName)); depth ++; } } super.startElement(uri, localName, qName, atts); } @Override public void endElement(String uri, String localName, String qName) throws SAXException { if(depth == 1) 
{ try { JAXBElement<T> object = unmarshaller.unmarshal(current, type); currentObject = object.getValue(); for(Listener<T> listener : listeners) listener.objectRead(currentObject); } catch(JAXBException e) { for(Listener<T> listener : listeners) listener.unmarshalError(e.getMessage()); } current = null; depth = 0; } else depth --; if(depth > 0 ){ current = current.getParentNode(); } super.endElement(uri, localName, qName); } @Override public void characters(char[] ch, int start, int length) throws SAXException { if(current != null) { current.appendChild(document.createTextNode(new String(ch, start, length))); } super.characters(ch, start, length); } } |
In the next part of the series I will demonstrate how to use the class above.
Leave a Reply