View Javadoc

1   package ca.uhn.hl7v2.preparser;
2   
3   import java.io.IOException;
4   import java.util.ArrayList;
5   import java.util.Collection;
6   import java.util.Iterator;
7   import java.util.List;
8   import java.util.Map;
9   import java.util.Properties;
10  import java.util.SortedMap;
11  import java.util.TreeMap;
12  
13  import javax.xml.parsers.ParserConfigurationException;
14  import javax.xml.parsers.SAXParser;
15  import javax.xml.parsers.SAXParserFactory;
16  
17  import org.xml.sax.Attributes;
18  import org.xml.sax.InputSource;
19  import org.xml.sax.SAXException;
20  import org.xml.sax.SAXParseException;
21  import org.xml.sax.helpers.DefaultHandler;
22  
23  import ca.uhn.hl7v2.HL7Exception;
24  
25  public class XML
26  {
27  	@SuppressWarnings("serial")
28  	protected static class StopParsingException extends SAXException
29  	{
30  		public StopParsingException() 
31  		{
32  			super("ca.uhn.hl7.....StopParsingException");
33  		}
34  	}
35  
36  	/** the SAXParser reports parsing events to an object of this class.
37  	We keep track of some parsing state, and the Properties object that 
38  	we're supposed to write our data to.
39  	*/
40  	static protected class HL7MessageHandler extends DefaultHandler 
41  	{
42  		/* m_props & m_msgMask should be set by the user of this handler before
43  		they pass this handler to SAXParser.parse() or whatever */
44  
45  		/** The data that is found while parsing, and which passes m_msgMask, 
46  		will be dumped to m_props, as (DatumPath.toString() / text) key/value
47  		pairs */
48  		public Properties m_props = null;
49  
50  		/** Specifies what parts of a message should be dumped to m_props. 
51  		*/
52  		public Collection<DatumPath> m_msgMask = null;
53  
54  		/* All other fields are parser state. */
55  
56  		protected boolean m_startedDocument = false;
57  
58  		/* m_msgID / m_curPath together keep track of where we are in the document.
59  
60  		If m_msgID.length() != 0, then we're within the message element.  (We're only
61  		expecting one message per document.)  Then m_msgID will be the name of the 
62  		message.  ("ACK" or whatever).  
63  
64  		m_curPath keeps track of where within the message we are.  See notes at 
65  		DatumPath class definition.  If m_curPath.size() != 0, then we must be 
66  		within a message.
67  
68  		At any point in the code below: 
69  
70  		if m_msgID.length() == 0, 
71  			then m_curPath.size() == 0
72  
73  		if m_curPath.size() != 0
74  			then m_msgID.length() != 0
75  		
76  		Note that our DatumPaths count indices starting from 0 (not 1) -- they're 
77  		only converted to 1-based in the string representations that wind up 
78  		as m_props keys.
79  		*/
80  		StringBuffer m_msgID = new StringBuffer();
81  		DatumPath m_curPath = new DatumPath();
82  
83  		/* the location in the document of the last datum we dumped to m_props. */
84  		DatumPath m_lastDumpedPath = new DatumPath();
85  
86  		/** For handling repeat segments.   segmentID (String) -> next repeat idx
87  		(Integer).  So when we hit a segment ZYX, we'll know how many times we've
88  		hit a ZYX before, and set the segmentRepIdx part of m_curPath
89  		appropriately. */
90  		SortedMap<String, Integer> m_segmentId2nextRepIdx = new TreeMap<String, Integer>();
91  
92  		/* m_depthWithinUselessElement and m_depthWithinUsefulElement 
93  		reflect what m_msgMask thinks about our location in the document at any
94  		given time.  
95  
96  		Both should always be >= -1.  Note that both can be >= 0 at the same time
97  		-- explained in a minute....
98  
99  		If m_depthWithinUsefulElement >= 0, this means that we are however deep
100 		(in terms of nested elements: 0 => just within) within an area of the
101 		message that passes m_msgMask.  We should dump whatever we find
102 		there to m_props.  As we move around within such an element, we will still
103 		update m_curPath appropriately.
104 
105 		If m_depthWithinUselessElement >= 0, we are however deep within an element
106 		which either made no sense (eg. <ZZZ.1> where we were expecting a <ZYX.1>
107 		-- a few other things maybe), or more importantly that we're within an
108 		element that otherwise has no hope of having any useful elements within it
109 		according to m_msgMask.  (eg. m_msgMask says it wants only ZYX segment
110 		contents, we're in an <MSH>).  So we can safely ignore all content within,
111 		and just keep track of how deep we are within this useless element (with
112 		m_depthWithinUselessElement, of course.)  We don't update m_curPath when
113 		m_depthWithinUselessElement >= 0, there's no point and how would we
114 		extract information for the DatumPath out of nonsensical element names
115 		anyway.
116 
117 		If they are both >= 0, this means that we've found some useless
118 		stuff (nonsensical element names?) within a known-useful element.
119 		*/
120 		int m_depthWithinUsefulElement = -1, m_depthWithinUselessElement = -1;
121 
122 		/* With this we keep the text that we've found within a certain element.
123 		It's cleared whenever we enter a (sub) element or leave an element. */
124 		StringBuffer m_chars = new StringBuffer(10);
125 
126 		public HL7MessageHandler()
127 		{
128 			this.clear();
129 		}
130 
131 		void clear()
132 		{
133 			// reset the state (m_props & m_msgMask are not state)
134 			m_startedDocument = false;
135 			m_msgID.delete(0, m_msgID.length());
136 			m_curPath.clear();
137 			// will always be "less than" (according to DatumPath.numbersLessThan)
138 			// any sensible DatumPath: 
139 			m_lastDumpedPath.clear().add(new String()).add(-42).add(-42).add(-42).add(-42).add(-42);
140 			m_segmentId2nextRepIdx.clear();
141 			m_depthWithinUsefulElement = -1;
142 			m_depthWithinUselessElement = -1;
143 			m_chars.delete(0, m_chars.length());
144 		}
145 
146 		public void startDocument() throws SAXException
147 		{
148 			boolean ok = false;
149 			if(!m_startedDocument && (m_props != null)) {
150 				m_startedDocument = true;
151 				ok = true;
152 			}
153 
154 			if(!ok) {
155 				clear();
156 				throw new StopParsingException();
157 			}
158 		}
159 
160 		public void endDocument() throws SAXException
161 		{
162 			boolean ok = false;
163 			if(m_startedDocument) {
164 				this.clear();
165 				ok = true;
166 			}
167 
168 			if(!ok) {
169 				clear();
170 				throw new StopParsingException();
171 			}
172 		}
173 
174 		public void startElement(String uri, String localName, String qName, 
175 				Attributes attributes) throws SAXException 
176 		{
177 			//System.err.println("startelem: " + qName + " curpathsize; " +
178 			//m_curPath.size());
179 			boolean ok = false;
180 			if(m_startedDocument) {
181 				// A single unit of text data will be within a single element, 
182 				// -- none of it will be in sub-elements and there will be no 
183 				// sub-elements fragmenting the data text.
184 				// Right now we're entering a new element: this means that anything
185 				// in m_chars will be whitespace (likely), or text left over from, 
186 				// say, the last field, or text that was somewhere it shouldn't have been.
187 				// (ex. "<ZYX.9> shouldn't be here <PT.1> P </PT.1> </ZYX.9>"
188 				m_chars.delete(0, m_chars.length());
189 
190 				if(m_depthWithinUselessElement >= 0) {
191 					++m_depthWithinUselessElement;
192 				}
193 				else {
194 					int oldCurPathSize = m_curPath.size();
195 					if(tryToGrowDocLocationFromElementName(m_msgID, m_curPath, 
196 						m_segmentId2nextRepIdx, m_lastDumpedPath, qName)) 
197 					{
198 						if(m_curPath.size() > oldCurPathSize) {
199 							// assert (m_depthWithinUselessElement == -1) // m_curPath
200 							// should not have grown if we're within a useless element.
201 							if(m_depthWithinUsefulElement == -1) {
202 								// this new element could match one of the DatumPaths in
203 								// m_msgMask -- if that's the case, we've just entered a
204 								// useful element.
205 								// TODO: functional stylee (a la C++'s std::accumulate) ? 
206 								boolean curPathStartsWithAMaskElem = false;
207 								for(Iterator<DatumPath> maskIt = m_msgMask.iterator(); 
208 									!curPathStartsWithAMaskElem && maskIt.hasNext(); )
209 								{
210 									curPathStartsWithAMaskElem 
211 										= m_curPath.startsWith(maskIt.next());
212 								}
213 
214 								if(curPathStartsWithAMaskElem) 
215 									m_depthWithinUsefulElement = 0;
216 								else {
217 									// so this element we're entering is not specified by m_msgMask
218 									// to be useful -- but might it contains elements that
219 									// are?
220 									boolean aMaskElemStartsWithCurPath = false;
221 									for(Iterator<DatumPath> maskIt = m_msgMask.iterator(); 
222 										!aMaskElemStartsWithCurPath && maskIt.hasNext(); )
223 									{
224 										aMaskElemStartsWithCurPath 
225 											= maskIt.next().startsWith(m_curPath);
226 									}
227 
228 									if(!aMaskElemStartsWithCurPath) {
229 										// ... nope!  useless.
230 										m_depthWithinUselessElement = 0;
231 										m_curPath.setSize(oldCurPathSize);
232 									} // else => ok, carry on, m_depthWithinUse{less,ful}Element
233 									// still both -1.
234 								}
235 							}
236 							// else => already within a useful element, don't need to compare 
237 							// against m_msgMask.
238 						}
239 					}
240 					else
241 						m_depthWithinUselessElement = 0;
242 				}
243 				ok = true;
244 			}
245 
246 			if(!ok) {
247 				clear();
248 				throw new StopParsingException();
249 			}
250 		}
251 
252 		/* doc location == msgID & curPath together.  
253 		If we've encountered an element called "elementNam", then this tries 
254 		to determine what it is, based on what we already know about the document.
255 		returns true if we can make sense of this new element name given the
256 		position we're at (represented by msgID / curPath), 
257 		false if we can't (which probably means this should be a useless element). 
258 		returning true doesn't mean that we actually changed msgID or curPath, it
259 		might mean that we just passed through a segment group element OK.
260 		*/
261 		protected static boolean tryToGrowDocLocationFromElementName(
262 			StringBuffer msgID /*in/out*/, DatumPath curPath /*in/out*/, 
263 			Map<String, Integer> segmentId2nextRepIdx /*in/out*/, DatumPath lastDumpedPath /*in*/, 
264 			String elementName /*in*/)
265 		{
266 			boolean ok = false; // ok == can we make sense of this new element?
267 			// hmm ... where are we in the document: 
268 			if((msgID.length() == 0) && (curPath.size() == 0)) {
269 				// we're entering a message
270 				msgID.replace(0, msgID.length(), elementName);
271 				segmentId2nextRepIdx.clear();
272 				ok = true;
273 			}
274 			else if((msgID.length() > 0) && (curPath.size() == 0)) {
275 				// we're entering either a segment-group element (eg. <ADT_A01.PROCEDURE>)
276 				// or an actual segment element.
277 				if(!(elementName.startsWith("" + msgID + '.'))) {
278 					// must be an actual segment.
279 					curPath.add(elementName);
280 
281 					if(segmentId2nextRepIdx.containsKey(elementName)) 
282 						curPath.add(segmentId2nextRepIdx.get(elementName));
283 					else
284 						curPath.add(new Integer(0));
285 
286 					segmentId2nextRepIdx.put(elementName, ((Integer)curPath.get(curPath.size()-1)).intValue() + 1);
287 				}
288 				ok = true;
289 			}
290 			else if((msgID.length() > 0) && (curPath.size() > 0)) {
291 				// we're entering a field or a component or a subcomponent.
292 				if(curPath.size() == 2) { // we're entering a field element
293 					// all fields should start with segment-ID + '.' 
294 					if(elementName.startsWith("" + curPath.get(0) + '.')) {
295 						try {
296 							int fieldIdxFromElementName 
297 								= Integer.parseInt(elementName.substring(elementName.indexOf('.') + 1));
298 
299 							curPath.add(new Integer(fieldIdxFromElementName));
300 
301 							// now add the repetition idx to curPath: 
302 							if((lastDumpedPath.size() >= 4) 
303 								&& (((Integer)lastDumpedPath.get(2)).intValue() 
304 									== fieldIdxFromElementName))
305 							{
306 								// lastDumpedPath has a fieldIdx and a fieldRepIdx.
307 								curPath.add(new Integer(((Integer)lastDumpedPath.get(3)).intValue() + 1));
308 							}
309 							else
310 								curPath.add(new Integer(0));
311 
312 							ok = true;
313 						} catch(NumberFormatException e) {}
314 					} // else => this isn't a field -- must be useless.
315 				}
316 				else if((curPath.size() == 4) || (curPath.size() == 5)) {
317 					// we're entering a component or subcomponent element
318 					try {
319 						int idxFromElementName 
320 							= Integer.parseInt(elementName.substring(elementName.indexOf('.') + 1));
321 						curPath.add(new Integer(idxFromElementName));
322 						ok = true;
323 					} catch(NumberFormatException e) {}
324 				}
325 			}
326 			return ok;
327 		}
328 
329 		public void endElement(String uri, String localName, String qName) 
330 			throws SAXException 
331 		{
332 			//System.err.println("endElement: " + qName);
333 			boolean ok = false;
334 			if(m_startedDocument) {
335 				if(m_depthWithinUselessElement >= 0) {
336 					--m_depthWithinUselessElement;
337 					ok = true;
338 				}
339 				else {
340 					if((m_msgID.length() > 0) && (m_curPath.size() == 0)) {
341 						// we're exiting either a message element or a 
342 						// segment group element.
343 						if((""+qName).compareTo(""+m_msgID) == 0)
344 							m_msgID.delete(0, m_msgID.length()); // => exiting message element
345 						// else => segment group element -- do nothing.
346 
347 						ok = true;
348 					}
349 					else if((m_msgID.length() > 0) && (m_curPath.size() > 0)) {
350 						tryToDumpDataToProps();
351 
352 						if(m_curPath.size() == 2) {
353 							// exiting a segment element
354 							m_curPath.setSize(0);
355 							ok = true;
356 						}
357 						else if(m_curPath.size() == 4) {
358 							// exiting a field element 
359 							m_curPath.setSize(2);
360 							ok = true;
361 						}
362 						else if((m_curPath.size() == 5) || (m_curPath.size() == 6)) {
363 							// exiting a component or a subcomponent
364 							m_curPath.setSize(m_curPath.size() - 1);
365 							ok = true;
366 						}
367 					}
368 
369 					if(m_depthWithinUsefulElement >= 0) 
370 						--m_depthWithinUsefulElement;
371 				}
372 			}
373 
374 			if(!ok) {
375 				clear();
376 				throw new StopParsingException();
377 			}
378 		}
379 
380 		/** try to dump whatever we've got in m_chars to m_props, 
381 		with a key of m_curPath.toString(). 
382 		*/
383 		protected void tryToDumpDataToProps()
384 		{
385 			if((m_curPath.size() >= 2) && (m_depthWithinUselessElement == -1)) {
386 				/* m_curPath.toString() will be the property key whose value will be
387 				m_chars.
388 
389 				This is (part of) what m_lastDumpedPath is for: With, for example "<ZYX.9>
390 				<PT.1>P</PT.1> </ZYX.9>" we might have had a m_curPath containing something
391 				like [ZYX, 0, 9, 0, 0] when we exited the PT.1 element.  (note: internal
392 				DatumPath elements are 0-indexed, string representations of DatumPaths and
393 				the XML text is 1-indexed.)  So in m_props the key for "P" would have been
394 				"ZYX[0]-9[0]-1-1".  (the last "-1" is a default that got added by
395 				toString()).
396 				
397 				Then we would have exited the PT.3 element, changed m_curPath to [ZYX, 0,
398 				9, 0], picked up the whitespace between </PT.3> and </ZYX.9>, and when
399 				exiting the ZYX.9 element, we might have written that whitespace to m_props
400 				with a key of the toString() of [ZYX, 0, 9, 0]; that is, "ZYX[0]-9[0]-1-1":
401 				the same as the key for the "P" ... clobbering "P" in m_props with
402 				whitespace.
403 
404 				But since we know that HL7 fields / components / etc are always in order
405 				(numerically), we can count on m_lastDumpedPath and use
406 				DatumPath.numbersLessThan to avoid the clobbering.
407 				*/
408 				if((m_lastDumpedPath.get(0).equals(m_curPath.get(0))) 
409 						? (m_lastDumpedPath.numbersLessThan(m_curPath)) 
410 						: true)
411 				{
412 					if(m_depthWithinUsefulElement >= 0) {
413 						// TODO: remove!  or assert 
414 						if(m_props.containsKey("" + m_curPath)) 
415 							System.err.println("ALAAAARM: CLOBBERING PROPERTY in " + getClass());
416 
417 						m_props.setProperty("" + m_curPath, "" + m_chars);
418 						m_lastDumpedPath.copy(m_curPath);
419 						m_chars.delete(0, m_chars.length());
420 					}
421 				}
422 			}
423 		}
424 
425 		public void characters(char[] chars, int start, int length)
426 		{
427 			// note that a contiguous run of characters in the document 
428 			// might get reported to us in several chunks. 
429 			// (In the order that the text appears in the document, 
430 			// non-overlapping and with no gaps between chunks.) 
431 			// An entity like &amp; will reach us as an actual & character.
432 			
433 			if((m_msgID.length() > 0) && (m_curPath.size() >= 4)) {
434 				m_chars.append(chars, start, length);
435 			}
436 		}
437 
438 		public void ignoreableWhitespace(char []chars, int start, int length)
439 		{
440 			// it's unclear which whitespace is considered ignorable for us.  
441 			// what the heck, add it to m_chars. 
442 			characters(chars, start, length);
443 		}
444 
445 		public void error(SAXParseException e)
446 		{
447 			// TODO: remove.
448 			System.err.println("Error in " + getClass() + ": " + e);
449 		}
450 
451 		public void fatalError(SAXParseException e) throws SAXException 
452 		{
453 			throw e;
454 		}
455 	}
456 
457 	/** parse message according to our HL7 XML handler, and dump the data found
458 	to props.  
459 	
460 	returns true if we parsed ok, which means well-formed XML, and
461 	that's about it.  We just barely check against HL7 structure, and ignore any
462 	elements / text that is unexpected (that is, impossible in any HL7 message:
463 	independant of any message / segment definitions).
464 
465 	"message" should be an XML document with one top-level element -- that being
466 	the message.  (<ACK> or whatever).  We're only expecting one message to be in
467 	"message".
468 
469 	props can be null if you don't want the data (we still parse).  The message
470 	data found in message (that passes msgMask) will be added to props as key /
471 	value pairs with the key a toString() of the appropriate DatumPath for the
472 	location where the data is found (i.e. in the ZYX[a]-b[c]-d-e style), and
473 	the value the corresponding text.  So, after calling parseMessage
474 	successfully, if you wanted to retrieve the message data from props you
475 	might call something like 
476 	props.getProperty((new DatumPath()).add("MSH").add(1).toString())
477 	and that would return a String with "|", probably.
478 
479 	Note that this package facilitates the extraction of message data in a way
480 	independent of message version (i.e. components and whatever getting added):
481 
482 	With a message of "<FOO><ZYX><ZYX.42>fieldy-field-field</ZYX.42></ZYX></FOO>",
483 	"ZYX[0]-1[0]-1-1" will be the key that ends up in props (see notes at
484 	DatumPath.toString())
485 
486 	So if you, coding for a future version of the FOO message but
487 	recieving old-version message data, tried
488 	props.getProperty((new DatumPath()).add("ZYX").add(0).add(42).add(0).add(1).toString()) 
489 	with the message above (that is, trying to extract a repetition and
490 	component that aren't there), you would get "ZYX[0]-42[0]-1-1" mapping to 
491 	"fieldy-field-field" in the resulting props.  
492 
493 	If the message was
494 	"<FOO><ZYX><ZYX.42><ARG.1>component data</ARG.1></ZYX.42></ZYX></FOO>"
495 	and you, coding for an old version of this FOO message but recieving
496 	new-version FOO message data, tried 
497 	props.getProperty((new DatumPath()).add("ZYX").add(0).add(42).toString())
498 	you would get "ZYX[0]-42[0]-1-1" mapping to "component data" in the resulting 
499 	props.
500 
501 	msgMask lets you specify which parts of the message you want dumped to props.
502 	Passing in null gets you everything.  Otherwise, msgMask's elements should
503 	all be DatumPaths (! => ClassCastException), and a particular part of the
504 	message will be dumped to props only if it's location, as represented by a
505 	DatumPath, startsWith (as in DatumPath.startsWith()) at least one element of
506 	msgMask.  So if one element of msgMask was a (new DatumPath()).add(new
507 	String("ZYX")), then everything in all ZYX segment would get dumped to props.
508 	A (new DatumPath()).add(new String("ZYX")).add(1) would get only the first
509 	repetitions of same (if there is one) dumped to props.  etc. etc.  Note that
510 	a DatumPath of size() == 0 in msgMask will get you everything, no matter what
511 	the other elements of msgMask are, because all DatumPaths startsWith the
512 	zero-length DatumPath.
513 
514 	Segment group elements (eg. ADT_A01.PROCEDURE) are handled fine, but they
515 	aren't addressed in msgMask or in the output in props -- basically any
516 	element tags at the level immediately inside the message element, and having
517 	a name that starts with the message element name + '.', is ignored (meaning
518 	it's contents are dealt with the same as if the start and end tags' just 
519 	wasn't there.)
520 	*/
521 	public static boolean parseMessage(Properties props, String message, 
522 			Collection<DatumPath> msgMask) throws HL7Exception
523 	{
524 		boolean ret = false;
525 		try {
526 			SAXParserFactory factory = SAXParserFactory.newInstance();
527 			SAXParser parser = factory.newSAXParser();
528 
529 			InputSource inSrc = new InputSource(new java.io.StringReader(message));
530 
531 			HL7MessageHandler handler = new HL7MessageHandler();
532 			handler.m_props = (props != null 
533 				? props : new Properties()); // it's expecting a props.
534 
535 			if(msgMask != null)
536 				handler.m_msgMask = msgMask;
537 			else {
538 				handler.m_msgMask = new ArrayList<DatumPath>();
539 				handler.m_msgMask.add(new DatumPath());
540 			}
541 
542 			parser.parse(inSrc, handler);
543 			ret = true;
544         } catch (ParserConfigurationException e) {
545             throw new HL7Exception(e);
546         } catch (IOException e) {
547             throw new HL7Exception(e);
548         } catch (StopParsingException e) {
549             throw new HL7Exception(e);
550         } catch (SAXException e) {
551             throw new HL7Exception(e);
552         }
553 
554 		return ret;
555 	}
556 
557 	public static void main(String args[]) 
558 	{
559 		if(args.length >= 1) {
560 			Properties props = new Properties();
561 			List<DatumPath> msgMask = new ArrayList<DatumPath>();
562 			msgMask.add(new DatumPath().add("MSH").add(0).add(9));
563 			//msgMask.add(new DatumPath());
564 			boolean parseret;
565             try {
566                 parseret = XML.parseMessage(props, args[0], msgMask);
567                 System.err.println("parseMessage returned " + parseret);
568             } catch (HL7Exception e) {
569                 e.printStackTrace();
570             }
571 			props.list(System.err);
572 		}
573 	}
574 }
575