001// Copyright 2009-2013 The Apache Software Foundation
002//
003// Licensed under the Apache License, Version 2.0 (the "License");
004// you may not use this file except in compliance with the License.
005// You may obtain a copy of the License at
006//
007//     http://www.apache.org/licenses/LICENSE-2.0
008//
009// Unless required by applicable law or agreed to in writing, software
010// distributed under the License is distributed on an "AS IS" BASIS,
011// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
012// See the License for the specific language governing permissions and
013// limitations under the License.
014
015package org.apache.tapestry5.internal.services;
016
017import org.apache.tapestry5.ioc.Location;
018import org.apache.tapestry5.ioc.Resource;
019import org.apache.tapestry5.ioc.internal.util.CollectionFactory;
020import org.apache.tapestry5.ioc.internal.util.InternalUtils;
021import org.apache.tapestry5.ioc.internal.util.LocationImpl;
022import org.apache.tapestry5.ioc.util.ExceptionUtils;
023import org.xml.sax.*;
024import org.xml.sax.ext.Attributes2;
025import org.xml.sax.ext.LexicalHandler;
026import org.xml.sax.helpers.XMLReaderFactory;
027
028import javax.xml.namespace.QName;
029import java.io.*;
030import java.net.URL;
031import java.util.Collections;
032import java.util.List;
033import java.util.Map;
034
035/**
036 * Parses a document as a stream of XML tokens. It includes a special hack (as of Tapestry 5.3) to support the HTML5 doctype ({@code <!DOCTYPE html>})
037 * as if it were the XHTML transitional doctype
038 * ({@code <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">}).
039 */
040public class XMLTokenStream
041{
042
043    public static final String TRANSITIONAL_DOCTYPE = "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Transitional//EN\" \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd\">";
044
045    private static final DTDData HTML5_DTD_DATA = new DTDData("html", null, null);
046
047    private final class SaxHandler implements LexicalHandler, EntityResolver, ContentHandler
048    {
049        private Locator locator;
050
051        private int currentLine = -1;
052
053        private Location cachedLocation;
054
055        private Location textLocation;
056
057        private final StringBuilder builder = new StringBuilder();
058
059        private boolean inCDATA, insideDTD;
060
061        private List<NamespaceMapping> namespaceMappings = CollectionFactory.newList();
062
063        private Location getLocation()
064        {
065            if (locator == null)
066            {
067                if (cachedLocation == null)
068                {
069                    cachedLocation = new LocationImpl(resource);
070                }
071            } else {
072                int line = locator.getLineNumber();
073
074                if (currentLine != line)
075                    cachedLocation = null;
076
077                if (cachedLocation == null)
078                {
079                    // lineOffset accounts for the extra line when a doctype is injected. The line number reported
080                    // from the XML parser inlcudes the phantom doctype line, the lineOffset is used to subtract one
081                    // to get the real line number.
082                    cachedLocation = new LocationImpl(resource, line + lineOffset);
083                }
084            }
085
086            return cachedLocation;
087        }
088
089        private XMLToken add(XMLTokenType type)
090        {
091            XMLToken token = new XMLToken(type, getLocation());
092
093            tokens.add(token);
094
095            return token;
096        }
097
098        public InputSource resolveEntity(String publicId, String systemId) throws SAXException,
099                IOException
100        {
101            URL url = publicIdToURL.get(publicId);
102
103            try
104            {
105                if (url != null)
106                    return new InputSource(url.openStream());
107            } catch (IOException ex)
108            {
109                throw new SAXException(String.format("Unable to open stream for resource %s: %s",
110                        url, ExceptionUtils.toMessage(ex)), ex);
111            }
112
113            return null;
114        }
115
116        public void comment(char[] ch, int start, int length) throws SAXException
117        {
118            if (insideDTD)
119                return;
120
121            // TODO: Coalesce?
122            add(XMLTokenType.COMMENT).text = new String(ch, start, length);
123        }
124
125        public void startCDATA() throws SAXException
126        {
127            // TODO: Flush characters?
128
129            inCDATA = true;
130        }
131
132        public void endCDATA() throws SAXException
133        {
134            if (builder.length() != 0)
135            {
136                add(XMLTokenType.CDATA).text = builder.toString();
137            }
138
139            builder.setLength(0);
140            inCDATA = false;
141        }
142
143        public void characters(char[] ch, int start, int length) throws SAXException
144        {
145            if (inCDATA)
146            {
147                builder.append(ch, start, length);
148                return;
149            }
150
151            XMLToken token = new XMLToken(XMLTokenType.CHARACTERS, textLocation);
152            token.text = new String(ch, start, length);
153
154            tokens.add(token);
155        }
156
157        public void ignorableWhitespace(char[] ch, int start, int length) throws SAXException
158        {
159            characters(ch, start, length);
160        }
161
162        public void startDTD(final String name, final String publicId, final String systemId)
163                throws SAXException
164        {
165            insideDTD = true;
166
167            if (!ignoreDTD)
168            {
169                DTDData data = html5DTD ? HTML5_DTD_DATA : new DTDData(name, publicId, systemId);
170
171                add(XMLTokenType.DTD).dtdData = data;
172            }
173        }
174
175        public void endDocument() throws SAXException
176        {
177            add(XMLTokenType.END_DOCUMENT);
178        }
179
180        public void endElement(String uri, String localName, String qName) throws SAXException
181        {
182            add(XMLTokenType.END_ELEMENT);
183        }
184
185        public void setDocumentLocator(Locator locator)
186        {
187            this.locator = locator;
188        }
189
190        /**
191         * Checks for the extra namespace injected when the transitional doctype is injected (which
192         * occurs when the template contains no doctype).
193         */
194        private boolean ignoreURI(String uri)
195        {
196            return ignoreDTD && uri.equals("http://www.w3.org/1999/xhtml");
197        }
198
199        public void startElement(String uri, String localName, String qName, Attributes attributes)
200                throws SAXException
201        {
202            XMLToken token = add(XMLTokenType.START_ELEMENT);
203
204            token.uri = ignoreURI(uri) ? "" : uri;
205            token.localName = localName;
206            token.qName = qName;
207
208            // The XML parser tends to reuse the same Attributes object, so
209            // capture the data out of it.
210
211            Attributes2 a2 = (attributes instanceof Attributes2) ? (Attributes2) attributes : null;
212
213            if (attributes.getLength() == 0)
214            {
215                token.attributes = Collections.emptyList();
216            } else
217            {
218                token.attributes = CollectionFactory.newList();
219
220                for (int i = 0; i < attributes.getLength(); i++)
221                {
222                    // Filter out attributes that are not present in the XML input stream, but were
223                    // instead provided by DTD defaulting.
224
225                    if (a2 != null && !a2.isSpecified(i))
226                    {
227                        continue;
228                    }
229
230                    String prefixedName = attributes.getQName(i);
231
232                    int lastColon = prefixedName.lastIndexOf(':');
233
234                    String prefix = lastColon > 0 ? prefixedName.substring(0, lastColon) : "";
235
236                    QName qname = new QName(attributes.getURI(i), attributes.getLocalName(i),
237                            prefix);
238
239                    token.attributes.add(new AttributeInfo(qname, attributes.getValue(i)));
240                }
241            }
242
243            token.namespaceMappings = CollectionFactory.newList(namespaceMappings);
244
245            namespaceMappings.clear();
246
247            // Any text collected starts here as well:
248
249            textLocation = getLocation();
250        }
251
252        public void startPrefixMapping(String prefix, String uri) throws SAXException
253        {
254            if (ignoreDTD && prefix.equals("") && uri.equals("http://www.w3.org/1999/xhtml"))
255            {
256                return;
257            }
258
259            namespaceMappings.add(new NamespaceMapping(prefix, uri));
260        }
261
262        public void endDTD() throws SAXException
263        {
264            insideDTD = false;
265        }
266
267        public void endEntity(String name) throws SAXException
268        {
269        }
270
271        public void startEntity(String name) throws SAXException
272        {
273        }
274
275        public void endPrefixMapping(String prefix) throws SAXException
276        {
277        }
278
279        public void processingInstruction(String target, String data) throws SAXException
280        {
281        }
282
283        public void skippedEntity(String name) throws SAXException
284        {
285        }
286
287        public void startDocument() throws SAXException
288        {
289        }
290    }
291
292    private int cursor = -1;
293
294    private final List<XMLToken> tokens = CollectionFactory.newList();
295
296    private final Resource resource;
297
298    private final Map<String, URL> publicIdToURL;
299
300    private Location exceptionLocation;
301
302    private boolean html5DTD, ignoreDTD;
303
304    private int lineOffset;
305
306    public XMLTokenStream(Resource resource, Map<String, URL> publicIdToURL)
307    {
308        this.resource = resource;
309        this.publicIdToURL = publicIdToURL;
310    }
311
312    public void parse() throws SAXException, IOException
313    {
314        SaxHandler handler = new SaxHandler();
315
316        XMLReader reader = XMLReaderFactory.createXMLReader();
317
318        reader.setContentHandler(handler);
319        reader.setEntityResolver(handler);
320        reader.setProperty("http://xml.org/sax/properties/lexical-handler", handler);
321
322        InputStream stream = null;
323
324        try
325        {
326            stream = openStream();
327            reader.parse(new InputSource(stream));
328        } catch (IOException ex)
329        {
330            this.exceptionLocation = handler.getLocation();
331
332            throw ex;
333        } catch (SAXException ex)
334        {
335            this.exceptionLocation = handler.getLocation();
336
337            throw ex;
338        } catch (RuntimeException ex)
339        {
340            this.exceptionLocation = handler.getLocation();
341
342            throw ex;
343        } finally
344        {
345            InternalUtils.close(stream);
346        }
347    }
348
349    enum State
350    {
351        MAYBE_XML, MAYBE_DOCTYPE, JUST_COPY
352    }
353
354    private InputStream openStream() throws IOException
355    {
356        InputStream rawStream = resource.openStream();
357
358        String transformationEncoding = "UTF8";
359
360        InputStreamReader rawReader = new InputStreamReader(rawStream, transformationEncoding);
361        LineNumberReader reader = new LineNumberReader(rawReader);
362
363        ByteArrayOutputStream bos = new ByteArrayOutputStream(5000);
364        PrintWriter writer = new PrintWriter(new OutputStreamWriter(bos, transformationEncoding));
365
366        State state = State.MAYBE_XML;
367
368        try
369        {
370            while (true)
371            {
372                String line = reader.readLine();
373
374                if (line == null)
375                {
376                    break;
377                }
378
379                switch (state)
380                {
381
382                    case MAYBE_XML:
383
384                        if (line.toLowerCase().startsWith("<?xml"))
385                        {
386                            writer.println(line);
387                            state = State.MAYBE_DOCTYPE;
388                            continue;
389                        }
390
391                    case MAYBE_DOCTYPE:
392
393                        if (line.trim().length() == 0)
394                        {
395                            writer.println(line);
396                            continue;
397                        }
398
399                        String lineLower = line.toLowerCase();
400
401                        if (lineLower.equals("<!doctype html>"))
402                        {
403                            html5DTD = true;
404                            writer.println(TRANSITIONAL_DOCTYPE);
405                            state = State.JUST_COPY;
406                            continue;
407                        }
408
409
410                        if (lineLower.startsWith("<!doctype"))
411                        {
412                            writer.println(line);
413                            state = State.JUST_COPY;
414                            continue;
415                        }
416
417                        // No doctype, let's provide one.
418
419                        ignoreDTD = true;
420                        lineOffset = -1;
421                        writer.println(TRANSITIONAL_DOCTYPE);
422
423                        state = State.JUST_COPY;
424
425                        // And drop down to writing out the actual line, and all following lines.
426
427                    case JUST_COPY:
428                        writer.println(line);
429                }
430            }
431        } finally
432        {
433            writer.close();
434            reader.close();
435        }
436
437        return new ByteArrayInputStream(bos.toByteArray());
438    }
439
440    private XMLToken token()
441    {
442        return cursor == -1 ? null : tokens.get(cursor);
443    }
444
445    /**
446     * Returns the type of the next token.
447     */
448    public XMLTokenType next()
449    {
450        cursor++;
451
452        // TODO: Check for overflow?
453
454        return getEventType();
455    }
456
457    public int getAttributeCount()
458    {
459        return token().attributes.size();
460    }
461
462    public QName getAttributeName(int i)
463    {
464        return token().attributes.get(i).attributeName;
465    }
466
467    public DTDData getDTDInfo()
468    {
469        return token().dtdData;
470    }
471
472    public XMLTokenType getEventType()
473    {
474        return token().type;
475    }
476
477    public String getLocalName()
478    {
479        return token().localName;
480    }
481
482    public Location getLocation()
483    {
484        if (exceptionLocation != null)
485            return exceptionLocation;
486
487        return token().getLocation();
488    }
489
490    public int getNamespaceCount()
491    {
492        return token().namespaceMappings.size();
493    }
494
495    public String getNamespacePrefix(int i)
496    {
497        return token().namespaceMappings.get(i).prefix;
498    }
499
500    public String getNamespaceURI()
501    {
502        return token().uri;
503    }
504
505    public String getNamespaceURI(int i)
506    {
507        return token().namespaceMappings.get(i).uri;
508    }
509
510    public String getText()
511    {
512        return token().text;
513    }
514
515    public boolean hasNext()
516    {
517        return cursor < tokens.size() - 1;
518    }
519
520    public String getAttributeValue(int i)
521    {
522        return token().attributes.get(i).value;
523    }
524
525}