001// Copyright 2009-2013 The Apache Software Foundation
002//
003// Licensed under the Apache License, Version 2.0 (the "License");
004// you may not use this file except in compliance with the License.
005// You may obtain a copy of the License at
006//
007//     http://www.apache.org/licenses/LICENSE-2.0
008//
009// Unless required by applicable law or agreed to in writing, software
010// distributed under the License is distributed on an "AS IS" BASIS,
011// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
012// See the License for the specific language governing permissions and
013// limitations under the License.
014
015package org.apache.tapestry5.internal.services;
016
017import org.apache.tapestry5.ioc.Location;
018import org.apache.tapestry5.ioc.Resource;
019import org.apache.tapestry5.ioc.internal.util.CollectionFactory;
020import org.apache.tapestry5.ioc.internal.util.InternalUtils;
021import org.apache.tapestry5.ioc.internal.util.LocationImpl;
022import org.apache.tapestry5.ioc.util.ExceptionUtils;
023import org.xml.sax.*;
024import org.xml.sax.ext.Attributes2;
025import org.xml.sax.ext.LexicalHandler;
026import org.xml.sax.helpers.XMLReaderFactory;
027
028import javax.xml.namespace.QName;
029import java.io.*;
030import java.net.URL;
031import java.util.Collections;
032import java.util.List;
033import java.util.Map;
034
035/**
036 * Parses a document as a stream of XML tokens. It includes a special hack (as of Tapestry 5.3) to support the HTML5 doctype ({@code <!DOCTYPE html>})
037 * as if it were the XHTML transitional doctype
038 * ({@code <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">}).
039 */
040public class XMLTokenStream
041{
042
043    public static final String TRANSITIONAL_DOCTYPE = "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Transitional//EN\" \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd\">";
044
045    private static final DTDData HTML5_DTD_DATA = new DTDData("html", null, null);
046
047    private final class SaxHandler implements LexicalHandler, EntityResolver, ContentHandler
048    {
049        private Locator locator;
050
051        private int currentLine = -1;
052
053        private Location cachedLocation;
054
055        private Location textLocation;
056
057        private final StringBuilder builder = new StringBuilder();
058
059        private boolean inCDATA, insideDTD;
060
061        private List<NamespaceMapping> namespaceMappings = CollectionFactory.newList();
062
063        private Location getLocation()
064        {
065            int line = locator.getLineNumber();
066
067            if (currentLine != line)
068                cachedLocation = null;
069
070            if (cachedLocation == null)
071            {
072                // lineOffset accounts for the extra line when a doctype is injected. The line number reported
073                // from the XML parser inlcudes the phantom doctype line, the lineOffset is used to subtract one
074                // to get the real line number.
075                cachedLocation = new LocationImpl(resource, line + lineOffset);
076            }
077
078            return cachedLocation;
079        }
080
081        private XMLToken add(XMLTokenType type)
082        {
083            XMLToken token = new XMLToken(type, getLocation());
084
085            tokens.add(token);
086
087            return token;
088        }
089
090        public InputSource resolveEntity(String publicId, String systemId) throws SAXException,
091                IOException
092        {
093            URL url = publicIdToURL.get(publicId);
094
095            try
096            {
097                if (url != null)
098                    return new InputSource(url.openStream());
099            } catch (IOException ex)
100            {
101                throw new SAXException(String.format("Unable to open stream for resource %s: %s",
102                        url, ExceptionUtils.toMessage(ex)), ex);
103            }
104
105            return null;
106        }
107
108        public void comment(char[] ch, int start, int length) throws SAXException
109        {
110            if (insideDTD)
111                return;
112
113            // TODO: Coalesce?
114            add(XMLTokenType.COMMENT).text = new String(ch, start, length);
115        }
116
117        public void startCDATA() throws SAXException
118        {
119            // TODO: Flush characters?
120
121            inCDATA = true;
122        }
123
124        public void endCDATA() throws SAXException
125        {
126            if (builder.length() != 0)
127            {
128                add(XMLTokenType.CDATA).text = builder.toString();
129            }
130
131            builder.setLength(0);
132            inCDATA = false;
133        }
134
135        public void characters(char[] ch, int start, int length) throws SAXException
136        {
137            if (inCDATA)
138            {
139                builder.append(ch, start, length);
140                return;
141            }
142
143            XMLToken token = new XMLToken(XMLTokenType.CHARACTERS, textLocation);
144            token.text = new String(ch, start, length);
145
146            tokens.add(token);
147        }
148
149        public void ignorableWhitespace(char[] ch, int start, int length) throws SAXException
150        {
151            characters(ch, start, length);
152        }
153
154        public void startDTD(final String name, final String publicId, final String systemId)
155                throws SAXException
156        {
157            insideDTD = true;
158
159            if (!ignoreDTD)
160            {
161                DTDData data = html5DTD ? HTML5_DTD_DATA : new DTDData(name, publicId, systemId);
162
163                add(XMLTokenType.DTD).dtdData = data;
164            }
165        }
166
167        public void endDocument() throws SAXException
168        {
169            add(XMLTokenType.END_DOCUMENT);
170        }
171
172        public void endElement(String uri, String localName, String qName) throws SAXException
173        {
174            add(XMLTokenType.END_ELEMENT);
175        }
176
177        public void setDocumentLocator(Locator locator)
178        {
179            this.locator = locator;
180        }
181
182        /**
183         * Checks for the extra namespace injected when the transitional doctype is injected (which
184         * occurs when the template contains no doctype).
185         */
186        private boolean ignoreURI(String uri)
187        {
188            return ignoreDTD && uri.equals("http://www.w3.org/1999/xhtml");
189        }
190
191        public void startElement(String uri, String localName, String qName, Attributes attributes)
192                throws SAXException
193        {
194            XMLToken token = add(XMLTokenType.START_ELEMENT);
195
196            token.uri = ignoreURI(uri) ? "" : uri;
197            token.localName = localName;
198            token.qName = qName;
199
200            // The XML parser tends to reuse the same Attributes object, so
201            // capture the data out of it.
202
203            Attributes2 a2 = (attributes instanceof Attributes2) ? (Attributes2) attributes : null;
204
205            if (attributes.getLength() == 0)
206            {
207                token.attributes = Collections.emptyList();
208            } else
209            {
210                token.attributes = CollectionFactory.newList();
211
212                for (int i = 0; i < attributes.getLength(); i++)
213                {
214                    // Filter out attributes that are not present in the XML input stream, but were
215                    // instead provided by DTD defaulting.
216
217                    if (a2 != null && !a2.isSpecified(i))
218                    {
219                        continue;
220                    }
221
222                    String prefixedName = attributes.getQName(i);
223
224                    int lastColon = prefixedName.lastIndexOf(':');
225
226                    String prefix = lastColon > 0 ? prefixedName.substring(0, lastColon) : "";
227
228                    QName qname = new QName(attributes.getURI(i), attributes.getLocalName(i),
229                            prefix);
230
231                    token.attributes.add(new AttributeInfo(qname, attributes.getValue(i)));
232                }
233            }
234
235            token.namespaceMappings = CollectionFactory.newList(namespaceMappings);
236
237            namespaceMappings.clear();
238
239            // Any text collected starts here as well:
240
241            textLocation = getLocation();
242        }
243
244        public void startPrefixMapping(String prefix, String uri) throws SAXException
245        {
246            if (ignoreDTD && prefix.equals("") && uri.equals("http://www.w3.org/1999/xhtml"))
247            {
248                return;
249            }
250
251            namespaceMappings.add(new NamespaceMapping(prefix, uri));
252        }
253
254        public void endDTD() throws SAXException
255        {
256            insideDTD = false;
257        }
258
259        public void endEntity(String name) throws SAXException
260        {
261        }
262
263        public void startEntity(String name) throws SAXException
264        {
265        }
266
267        public void endPrefixMapping(String prefix) throws SAXException
268        {
269        }
270
271        public void processingInstruction(String target, String data) throws SAXException
272        {
273        }
274
275        public void skippedEntity(String name) throws SAXException
276        {
277        }
278
279        public void startDocument() throws SAXException
280        {
281        }
282    }
283
284    private int cursor = -1;
285
286    private final List<XMLToken> tokens = CollectionFactory.newList();
287
288    private final Resource resource;
289
290    private final Map<String, URL> publicIdToURL;
291
292    private Location exceptionLocation;
293
294    private boolean html5DTD, ignoreDTD;
295
296    private int lineOffset;
297
298    public XMLTokenStream(Resource resource, Map<String, URL> publicIdToURL)
299    {
300        this.resource = resource;
301        this.publicIdToURL = publicIdToURL;
302    }
303
304    public void parse() throws SAXException, IOException
305    {
306        SaxHandler handler = new SaxHandler();
307
308        XMLReader reader = XMLReaderFactory.createXMLReader();
309
310        reader.setContentHandler(handler);
311        reader.setEntityResolver(handler);
312        reader.setProperty("http://xml.org/sax/properties/lexical-handler", handler);
313
314        InputStream stream = openStream();
315
316        try
317        {
318            reader.parse(new InputSource(stream));
319        } catch (IOException ex)
320        {
321            this.exceptionLocation = handler.getLocation();
322
323            throw ex;
324        } catch (SAXException ex)
325        {
326            this.exceptionLocation = handler.getLocation();
327
328            throw ex;
329        } catch (RuntimeException ex)
330        {
331            this.exceptionLocation = handler.getLocation();
332
333            throw ex;
334        } finally
335        {
336            InternalUtils.close(stream);
337        }
338    }
339
340    enum State
341    {
342        MAYBE_XML, MAYBE_DOCTYPE, JUST_COPY
343    }
344
345    private InputStream openStream() throws IOException
346    {
347        InputStream rawStream = resource.openStream();
348        
349        String transformationEncoding = "UTF8";
350
351        InputStreamReader rawReader = new InputStreamReader(rawStream, transformationEncoding);
352        LineNumberReader reader = new LineNumberReader(rawReader);
353
354        ByteArrayOutputStream bos = new ByteArrayOutputStream(5000);
355        PrintWriter writer = new PrintWriter(new OutputStreamWriter(bos, transformationEncoding));
356
357        State state = State.MAYBE_XML;
358
359        try
360        {
361            while (true)
362            {
363                String line = reader.readLine();
364
365                if (line == null)
366                {
367                    break;
368                }
369
370                switch (state)
371                {
372
373                    case MAYBE_XML:
374
375                        if (line.toLowerCase().startsWith("<?xml"))
376                        {
377                            writer.println(line);
378                            state = State.MAYBE_DOCTYPE;
379                            continue;
380                        }
381
382                    case MAYBE_DOCTYPE:
383
384                        if (line.trim().length() == 0)
385                        {
386                            writer.println(line);
387                            continue;
388                        }
389
390                        String lineLower = line.toLowerCase();
391
392                        if (lineLower.equals("<!doctype html>"))
393                        {
394                            html5DTD = true;
395                            writer.println(TRANSITIONAL_DOCTYPE);
396                            state = State.JUST_COPY;
397                            continue;
398                        }
399
400
401                        if (lineLower.startsWith("<!doctype"))
402                        {
403                            writer.println(line);
404                            state = State.JUST_COPY;
405                            continue;
406                        }
407
408                        // No doctype, let's provide one.
409
410                        ignoreDTD = true;
411                        lineOffset = -1;
412                        writer.println(TRANSITIONAL_DOCTYPE);
413
414                        state = State.JUST_COPY;
415
416                        // And drop down to writing out the actual line, and all following lines.
417
418                    case JUST_COPY:
419                        writer.println(line);
420                }
421            }
422        } finally
423        {
424            writer.close();
425            reader.close();
426        }
427
428        return new ByteArrayInputStream(bos.toByteArray());
429    }
430
431    private XMLToken token()
432    {
433        return tokens.get(cursor);
434    }
435
436    /**
437     * Returns the type of the next token.
438     */
439    public XMLTokenType next()
440    {
441        cursor++;
442
443        // TODO: Check for overflow?
444
445        return getEventType();
446    }
447
448    public int getAttributeCount()
449    {
450        return token().attributes.size();
451    }
452
453    public QName getAttributeName(int i)
454    {
455        return token().attributes.get(i).attributeName;
456    }
457
458    public DTDData getDTDInfo()
459    {
460        return token().dtdData;
461    }
462
463    public XMLTokenType getEventType()
464    {
465        return token().type;
466    }
467
468    public String getLocalName()
469    {
470        return token().localName;
471    }
472
473    public Location getLocation()
474    {
475        if (exceptionLocation != null)
476            return exceptionLocation;
477
478        return token().getLocation();
479    }
480
481    public int getNamespaceCount()
482    {
483        return token().namespaceMappings.size();
484    }
485
486    public String getNamespacePrefix(int i)
487    {
488        return token().namespaceMappings.get(i).prefix;
489    }
490
491    public String getNamespaceURI()
492    {
493        return token().uri;
494    }
495
496    public String getNamespaceURI(int i)
497    {
498        return token().namespaceMappings.get(i).uri;
499    }
500
501    public String getText()
502    {
503        return token().text;
504    }
505
506    public boolean hasNext()
507    {
508        return cursor < tokens.size() - 1;
509    }
510
511    public String getAttributeValue(int i)
512    {
513        return token().attributes.get(i).value;
514    }
515
516}