001    /*
002     * This file is part of muCommander, http://www.mucommander.com
003     * Copyright (C) 2002-2008 Maxence Bernard
004     *
005     * muCommander is free software; you can redistribute it and/or modify
006     * it under the terms of the GNU General Public License as published by
007     * the Free Software Foundation; either version 3 of the License, or
008     * (at your option) any later version.
009     *
010     * muCommander is distributed in the hope that it will be useful,
011     * but WITHOUT ANY WARRANTY; without even the implied warranty of
012     * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
013     * GNU General Public License for more details.
014     *
015     * You should have received a copy of the GNU General Public License
016     * along with this program.  If not, see <http://www.gnu.org/licenses/>.
017     */
018    
019    package com.mucommander.io.bom;
020    
021    import java.io.IOException;
022    import java.io.InputStream;
023    
024    /**
025     * <code>BOMInputStream</code> is an <code>InputStream</code> which provides support for Byte-Order Marks (BOM).
026     * A BOM is a byte sequence found at the beginning of a Unicode text stream which indicates the encoding of the text
027     * that follows.
028     *
029     * <p>
030     * This class serves a dual purpose:<br>
031     * 1) it allows to detect a BOM in the underlying stream and determine the encoding used by the stream:
032     * the {@link BOM} instance returned by {@link #getBOM()} provides that information.<br>
033     * 2) it allows to discard the BOM from a Unicode stream: the leading bytes corresponding to the BOM are swallowed by
034     * the stream and never returned by the <code>read</code> methods.
035     * </p>
036     *
037     *<p>
038     * The following BOMs are supported by this class:
039     * <ul>
040     *  <li>{@link #UTF8_BOM UTF-8}</li>
041     *  <li>{@link #UTF16_BE_BOM UTF-16 Big Endian}</li>
042     *  <li>{@link #UTF16_LE_BOM UTF-16 Little Endian}</li>
043     *  <li>{@link #UTF32_BE_BOM UTF-32 Big Endian}.</li>
044     *  <li>{@link #UTF32_LE_BOM UTF-32 Little Endian}</li>
045     * </ul>
046     * Note that UTF-32 encodings (both Little and Big Endians) are usually <b>not</b> supported by Java runtimes
047     * out of the box.
048     * <p>
049     *
050     * @see BOMReader
051     * @author Maxence Bernard
052     */
053    public class BOMInputStream extends InputStream implements BOMConstants {
054    
055        /** The underlying InputStream that feeds bytes to this stream */
056        private InputStream in;
057    
058        /** Contains the BOM that was detected in the stream, null if none was found */
059        private BOM bom;
060    
061        /** Bytes that were swallowed by this stream when searching for a BOM, null if a BOM was found */
062        private byte leadingBytes[];
063    
064        /** Current offset within the {@link #leadingBytes} array */
065        private int leadingBytesOff;
066    
067        /** Contains the max signature length of supported BOMs */
068        private final static int MAX_BOM_LENGTH;
069    
070        static {
071            // Calculates MAX_BOM_LENGTH
072            int maxLen = SUPPORTED_BOMS[0].getSignature().length;
073            int len;
074            for(int i=1; i<SUPPORTED_BOMS.length; i++) {
075                len = SUPPORTED_BOMS[i].getSignature().length;
076                if(len>maxLen)
077                    maxLen = len;
078            }
079    
080            MAX_BOM_LENGTH = maxLen;
081        }
082    
083    
084        /**
085         * Creates a new <code>BOMInputStream</code> and looks for a BOM at the beginning of the stream.
086         *
087         * @param in the underlying stream
088         * @throws IOException if an error occurred while reading the given InputStream
089         */
090        public BOMInputStream(InputStream in) throws IOException {
091            this.in = in;
092    
093            // Read up to MAX_BOM_LENGTH bytes
094            byte bytes[] = new byte[MAX_BOM_LENGTH];
095            int nbRead;
096            int totalRead = 0;
097            while((nbRead=in.read(bytes, totalRead, MAX_BOM_LENGTH-totalRead))!=-1 && (totalRead+=nbRead)<MAX_BOM_LENGTH);
098    
099            // Truncate the byte array if the stream ended before MAX_BOM_LENGTH
100            if(totalRead<MAX_BOM_LENGTH) {
101                byte tempBytes[] = new byte[totalRead];
102                System.arraycopy(bytes, 0, tempBytes, 0, totalRead);
103                bytes = tempBytes;
104            }
105    
106            int bestMatchLength = 0;
107            int bestMatchIndex = -1;
108            BOM tempBom;
109            byte[] tempBomSig;
110    
111            // Looks for the best (longest) signature match
112            for(int i=0; i<SUPPORTED_BOMS.length; i++) {
113                tempBom = SUPPORTED_BOMS[i];
114                tempBomSig = tempBom.getSignature();
115                if(tempBomSig.length>bestMatchLength && startsWith(bytes, tempBomSig)) {
116                    bestMatchIndex = i;
117                    bestMatchLength = tempBomSig.length;
118                }
119            }
120    
121            // Keep the bytes that do not correspond to a BOM to have the read methods return them
122            if(bestMatchIndex!=-1) {
123                bom = SUPPORTED_BOMS[bestMatchIndex];
124                if(bestMatchLength<MAX_BOM_LENGTH) {
125                    leadingBytes = bytes;
126                    leadingBytesOff = bestMatchLength;
127                }
128            }
129            else {
130                leadingBytes = bytes;
131                leadingBytesOff = 0;
132            }
133        }
134    
135        /**
136         * Returns <code>true</code> if the first byte sequence starts with the second byte sequence.
137         *
138         * @param b1 first byte array to test
139         * @param b2 second byte array to test
140         * @return true if the first byte sequence starts with the second byte sequence.
141         */
142        private static boolean startsWith(byte b1[], byte b2[]) {
143            int b1Len = b1.length;
144            int b2Len = b2.length;
145            if(b1Len<b2Len)
146                return false;
147    
148            for(int i=0; i<b2Len; i++) {
149                if(b2[i]!= b1[i])
150                    return false;
151            }
152    
153            return true;
154        }
155    
156        /**
157         * Returns the {@link BOM} that was found at the beginning of the stream if there was one,
158         * <code>null</code> otherwise.
159         *
160         * @return the BOM that was found at the beginning of the stream
161         */
162        public BOM getBOM() {
163            return bom;
164        }
165    
166    
167        ////////////////////////////////
168        // InputStream implementation //
169        ////////////////////////////////
170    
171        public int read() throws IOException {
172            if(leadingBytes==null)
173                return in.read();
174    
175            int i = leadingBytes[leadingBytesOff++];
176    
177            if(leadingBytesOff>=leadingBytes.length)
178                leadingBytes = null;
179    
180            return i;
181        }
182    
183        public int read(byte b[]) throws IOException {
184            return read(b, 0, b.length);
185        }
186    
187        public int read(byte b[], int off, int len) throws IOException {
188            if(leadingBytes==null)
189                return in.read(b, off, len);
190    
191            int nbBytes = Math.min(leadingBytes.length-leadingBytesOff, len);
192            System.arraycopy(leadingBytes, leadingBytesOff, b, off, nbBytes);
193    
194            leadingBytesOff += nbBytes;
195            if(leadingBytesOff>=leadingBytes.length)
196                leadingBytes = null;
197    
198            return nbBytes;
199        }
200    
201        public void close() throws IOException {
202            in.close();
203        }
204    }