001 /*
002 * This file is part of muCommander, http://www.mucommander.com
003 * Copyright (C) 2002-2008 Maxence Bernard
004 *
005 * muCommander is free software; you can redistribute it and/or modify
006 * it under the terms of the GNU General Public License as published by
007 * the Free Software Foundation; either version 3 of the License, or
008 * (at your option) any later version.
009 *
010 * muCommander is distributed in the hope that it will be useful,
011 * but WITHOUT ANY WARRANTY; without even the implied warranty of
012 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
013 * GNU General Public License for more details.
014 *
015 * You should have received a copy of the GNU General Public License
016 * along with this program. If not, see <http://www.gnu.org/licenses/>.
017 */
018
019 package com.mucommander.io.bom;
020
021 import java.io.IOException;
022 import java.io.InputStream;
023
024 /**
025 * <code>BOMInputStream</code> is an <code>InputStream</code> which provides support for Byte-Order Marks (BOM).
026 * A BOM is a byte sequence found at the beginning of a Unicode text stream which indicates the encoding of the text
027 * that follows.
028 *
029 * <p>
030 * This class serves a dual purpose:<br>
031 * 1) it allows to detect a BOM in the underlying stream and determine the encoding used by the stream:
032 * the {@link BOM} instance returned by {@link #getBOM()} provides that information.<br>
033 * 2) it allows to discard the BOM from a Unicode stream: the leading bytes corresponding to the BOM are swallowed by
034 * the stream and never returned by the <code>read</code> methods.
035 * </p>
036 *
037 *<p>
038 * The following BOMs are supported by this class:
039 * <ul>
040 * <li>{@link #UTF8_BOM UTF-8}</li>
041 * <li>{@link #UTF16_BE_BOM UTF-16 Big Endian}</li>
042 * <li>{@link #UTF16_LE_BOM UTF-16 Little Endian}</li>
043 * <li>{@link #UTF32_BE_BOM UTF-32 Big Endian}.</li>
044 * <li>{@link #UTF32_LE_BOM UTF-32 Little Endian}</li>
045 * </ul>
046 * Note that UTF-32 encodings (both Little and Big Endians) are usually <b>not</b> supported by Java runtimes
047 * out of the box.
048 * <p>
049 *
050 * @see BOMReader
051 * @author Maxence Bernard
052 */
053 public class BOMInputStream extends InputStream implements BOMConstants {
054
055 /** The underlying InputStream that feeds bytes to this stream */
056 private InputStream in;
057
058 /** Contains the BOM that was detected in the stream, null if none was found */
059 private BOM bom;
060
061 /** Bytes that were swallowed by this stream when searching for a BOM, null if a BOM was found */
062 private byte leadingBytes[];
063
064 /** Current offset within the {@link #leadingBytes} array */
065 private int leadingBytesOff;
066
067 /** Contains the max signature length of supported BOMs */
068 private final static int MAX_BOM_LENGTH;
069
070 static {
071 // Calculates MAX_BOM_LENGTH
072 int maxLen = SUPPORTED_BOMS[0].getSignature().length;
073 int len;
074 for(int i=1; i<SUPPORTED_BOMS.length; i++) {
075 len = SUPPORTED_BOMS[i].getSignature().length;
076 if(len>maxLen)
077 maxLen = len;
078 }
079
080 MAX_BOM_LENGTH = maxLen;
081 }
082
083
084 /**
085 * Creates a new <code>BOMInputStream</code> and looks for a BOM at the beginning of the stream.
086 *
087 * @param in the underlying stream
088 * @throws IOException if an error occurred while reading the given InputStream
089 */
090 public BOMInputStream(InputStream in) throws IOException {
091 this.in = in;
092
093 // Read up to MAX_BOM_LENGTH bytes
094 byte bytes[] = new byte[MAX_BOM_LENGTH];
095 int nbRead;
096 int totalRead = 0;
097 while((nbRead=in.read(bytes, totalRead, MAX_BOM_LENGTH-totalRead))!=-1 && (totalRead+=nbRead)<MAX_BOM_LENGTH);
098
099 // Truncate the byte array if the stream ended before MAX_BOM_LENGTH
100 if(totalRead<MAX_BOM_LENGTH) {
101 byte tempBytes[] = new byte[totalRead];
102 System.arraycopy(bytes, 0, tempBytes, 0, totalRead);
103 bytes = tempBytes;
104 }
105
106 int bestMatchLength = 0;
107 int bestMatchIndex = -1;
108 BOM tempBom;
109 byte[] tempBomSig;
110
111 // Looks for the best (longest) signature match
112 for(int i=0; i<SUPPORTED_BOMS.length; i++) {
113 tempBom = SUPPORTED_BOMS[i];
114 tempBomSig = tempBom.getSignature();
115 if(tempBomSig.length>bestMatchLength && startsWith(bytes, tempBomSig)) {
116 bestMatchIndex = i;
117 bestMatchLength = tempBomSig.length;
118 }
119 }
120
121 // Keep the bytes that do not correspond to a BOM to have the read methods return them
122 if(bestMatchIndex!=-1) {
123 bom = SUPPORTED_BOMS[bestMatchIndex];
124 if(bestMatchLength<MAX_BOM_LENGTH) {
125 leadingBytes = bytes;
126 leadingBytesOff = bestMatchLength;
127 }
128 }
129 else {
130 leadingBytes = bytes;
131 leadingBytesOff = 0;
132 }
133 }
134
135 /**
136 * Returns <code>true</code> if the first byte sequence starts with the second byte sequence.
137 *
138 * @param b1 first byte array to test
139 * @param b2 second byte array to test
140 * @return true if the first byte sequence starts with the second byte sequence.
141 */
142 private static boolean startsWith(byte b1[], byte b2[]) {
143 int b1Len = b1.length;
144 int b2Len = b2.length;
145 if(b1Len<b2Len)
146 return false;
147
148 for(int i=0; i<b2Len; i++) {
149 if(b2[i]!= b1[i])
150 return false;
151 }
152
153 return true;
154 }
155
156 /**
157 * Returns the {@link BOM} that was found at the beginning of the stream if there was one,
158 * <code>null</code> otherwise.
159 *
160 * @return the BOM that was found at the beginning of the stream
161 */
162 public BOM getBOM() {
163 return bom;
164 }
165
166
167 ////////////////////////////////
168 // InputStream implementation //
169 ////////////////////////////////
170
171 public int read() throws IOException {
172 if(leadingBytes==null)
173 return in.read();
174
175 int i = leadingBytes[leadingBytesOff++];
176
177 if(leadingBytesOff>=leadingBytes.length)
178 leadingBytes = null;
179
180 return i;
181 }
182
183 public int read(byte b[]) throws IOException {
184 return read(b, 0, b.length);
185 }
186
187 public int read(byte b[], int off, int len) throws IOException {
188 if(leadingBytes==null)
189 return in.read(b, off, len);
190
191 int nbBytes = Math.min(leadingBytes.length-leadingBytesOff, len);
192 System.arraycopy(leadingBytes, leadingBytesOff, b, off, nbBytes);
193
194 leadingBytesOff += nbBytes;
195 if(leadingBytesOff>=leadingBytes.length)
196 leadingBytes = null;
197
198 return nbBytes;
199 }
200
201 public void close() throws IOException {
202 in.close();
203 }
204 }