001 /**
002 * Licensed to the Apache Software Foundation (ASF) under one
003 * or more contributor license agreements. See the NOTICE file
004 * distributed with this work for additional information
005 * regarding copyright ownership. The ASF licenses this file
006 * to you under the Apache License, Version 2.0 (the
007 * "License"); you may not use this file except in compliance
008 * with the License. You may obtain a copy of the License at
009 *
010 * http://www.apache.org/licenses/LICENSE-2.0
011 *
012 * Unless required by applicable law or agreed to in writing, software
013 * distributed under the License is distributed on an "AS IS" BASIS,
014 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
015 * See the License for the specific language governing permissions and
016 * limitations under the License.
017 */
018
019 package org.apache.hadoop.io;
020
021 import java.io.IOException;
022 import java.io.DataInput;
023 import java.io.DataOutput;
024
025
026 import org.apache.commons.logging.*;
027 import org.apache.hadoop.classification.InterfaceAudience;
028 import org.apache.hadoop.classification.InterfaceStability;
029
030 /** A WritableComparable for strings that uses the UTF8 encoding.
031 *
032 * <p>Also includes utilities for efficiently reading and writing UTF-8.
033 *
034 * @deprecated replaced by Text
035 */
036 @Deprecated
037 @InterfaceAudience.LimitedPrivate({"HDFS", "MapReduce"})
038 @InterfaceStability.Stable
039 public class UTF8 implements WritableComparable<UTF8> {
040 private static final Log LOG= LogFactory.getLog(UTF8.class);
041 private static final DataInputBuffer IBUF = new DataInputBuffer();
042
043 private static final ThreadLocal<DataOutputBuffer> OBUF_FACTORY =
044 new ThreadLocal<DataOutputBuffer>(){
045 @Override
046 protected DataOutputBuffer initialValue() {
047 return new DataOutputBuffer();
048 }
049 };
050
051 private static final byte[] EMPTY_BYTES = new byte[0];
052
053 private byte[] bytes = EMPTY_BYTES;
054 private int length;
055
056 public UTF8() {
057 //set("");
058 }
059
060 /** Construct from a given string. */
061 public UTF8(String string) {
062 set(string);
063 }
064
065 /** Construct from a given string. */
066 public UTF8(UTF8 utf8) {
067 set(utf8);
068 }
069
070 /** The raw bytes. */
071 public byte[] getBytes() {
072 return bytes;
073 }
074
075 /** The number of bytes in the encoded string. */
076 public int getLength() {
077 return length;
078 }
079
080 /** Set to contain the contents of a string. */
081 public void set(String string) {
082 if (string.length() > 0xffff/3) { // maybe too long
083 LOG.warn("truncating long string: " + string.length()
084 + " chars, starting with " + string.substring(0, 20));
085 string = string.substring(0, 0xffff/3);
086 }
087
088 length = utf8Length(string); // compute length
089 if (length > 0xffff) // double-check length
090 throw new RuntimeException("string too long!");
091
092 if (bytes == null || length > bytes.length) // grow buffer
093 bytes = new byte[length];
094
095 try { // avoid sync'd allocations
096 DataOutputBuffer obuf = OBUF_FACTORY.get();
097 obuf.reset();
098 writeChars(obuf, string, 0, string.length());
099 System.arraycopy(obuf.getData(), 0, bytes, 0, length);
100 } catch (IOException e) {
101 throw new RuntimeException(e);
102 }
103 }
104
105 /** Set to contain the contents of a string. */
106 public void set(UTF8 other) {
107 length = other.length;
108 if (bytes == null || length > bytes.length) // grow buffer
109 bytes = new byte[length];
110 System.arraycopy(other.bytes, 0, bytes, 0, length);
111 }
112
113 public void readFields(DataInput in) throws IOException {
114 length = in.readUnsignedShort();
115 if (bytes == null || bytes.length < length)
116 bytes = new byte[length];
117 in.readFully(bytes, 0, length);
118 }
119
120 /** Skips over one UTF8 in the input. */
121 public static void skip(DataInput in) throws IOException {
122 int length = in.readUnsignedShort();
123 WritableUtils.skipFully(in, length);
124 }
125
126 public void write(DataOutput out) throws IOException {
127 out.writeShort(length);
128 out.write(bytes, 0, length);
129 }
130
131 /** Compare two UTF8s. */
132 @Override
133 public int compareTo(UTF8 o) {
134 return WritableComparator.compareBytes(bytes, 0, length,
135 o.bytes, 0, o.length);
136 }
137
138 /** Convert to a String. */
139 @Override
140 public String toString() {
141 StringBuilder buffer = new StringBuilder(length);
142 try {
143 synchronized (IBUF) {
144 IBUF.reset(bytes, length);
145 readChars(IBUF, buffer, length);
146 }
147 } catch (IOException e) {
148 throw new RuntimeException(e);
149 }
150 return buffer.toString();
151 }
152
153 /** Returns true iff <code>o</code> is a UTF8 with the same contents. */
154 @Override
155 public boolean equals(Object o) {
156 if (!(o instanceof UTF8))
157 return false;
158 UTF8 that = (UTF8)o;
159 if (this.length != that.length)
160 return false;
161 else
162 return WritableComparator.compareBytes(bytes, 0, length,
163 that.bytes, 0, that.length) == 0;
164 }
165
166 @Override
167 public int hashCode() {
168 return WritableComparator.hashBytes(bytes, length);
169 }
170
171 /** A WritableComparator optimized for UTF8 keys. */
172 public static class Comparator extends WritableComparator {
173 public Comparator() {
174 super(UTF8.class);
175 }
176
177 @Override
178 public int compare(byte[] b1, int s1, int l1,
179 byte[] b2, int s2, int l2) {
180 int n1 = readUnsignedShort(b1, s1);
181 int n2 = readUnsignedShort(b2, s2);
182 return compareBytes(b1, s1+2, n1, b2, s2+2, n2);
183 }
184 }
185
186 static { // register this comparator
187 WritableComparator.define(UTF8.class, new Comparator());
188 }
189
190 /// STATIC UTILITIES FROM HERE DOWN
191
192 /// These are probably not used much anymore, and might be removed...
193
194 /** Convert a string to a UTF-8 encoded byte array.
195 * @see String#getBytes(String)
196 */
197 public static byte[] getBytes(String string) {
198 byte[] result = new byte[utf8Length(string)];
199 try { // avoid sync'd allocations
200 DataOutputBuffer obuf = OBUF_FACTORY.get();
201 obuf.reset();
202 writeChars(obuf, string, 0, string.length());
203 System.arraycopy(obuf.getData(), 0, result, 0, obuf.getLength());
204 } catch (IOException e) {
205 throw new RuntimeException(e);
206 }
207 return result;
208 }
209
210 /** Read a UTF-8 encoded string.
211 *
212 * @see DataInput#readUTF()
213 */
214 public static String readString(DataInput in) throws IOException {
215 int bytes = in.readUnsignedShort();
216 StringBuilder buffer = new StringBuilder(bytes);
217 readChars(in, buffer, bytes);
218 return buffer.toString();
219 }
220
221 private static void readChars(DataInput in, StringBuilder buffer, int nBytes)
222 throws IOException {
223 DataOutputBuffer obuf = OBUF_FACTORY.get();
224 obuf.reset();
225 obuf.write(in, nBytes);
226 byte[] bytes = obuf.getData();
227 int i = 0;
228 while (i < nBytes) {
229 byte b = bytes[i++];
230 if ((b & 0x80) == 0) {
231 buffer.append((char)(b & 0x7F));
232 } else if ((b & 0xE0) != 0xE0) {
233 buffer.append((char)(((b & 0x1F) << 6)
234 | (bytes[i++] & 0x3F)));
235 } else {
236 buffer.append((char)(((b & 0x0F) << 12)
237 | ((bytes[i++] & 0x3F) << 6)
238 | (bytes[i++] & 0x3F)));
239 }
240 }
241 }
242
243 /** Write a UTF-8 encoded string.
244 *
245 * @see DataOutput#writeUTF(String)
246 */
247 public static int writeString(DataOutput out, String s) throws IOException {
248 if (s.length() > 0xffff/3) { // maybe too long
249 LOG.warn("truncating long string: " + s.length()
250 + " chars, starting with " + s.substring(0, 20));
251 s = s.substring(0, 0xffff/3);
252 }
253
254 int len = utf8Length(s);
255 if (len > 0xffff) // double-check length
256 throw new IOException("string too long!");
257
258 out.writeShort(len);
259 writeChars(out, s, 0, s.length());
260 return len;
261 }
262
263 /** Returns the number of bytes required to write this. */
264 private static int utf8Length(String string) {
265 int stringLength = string.length();
266 int utf8Length = 0;
267 for (int i = 0; i < stringLength; i++) {
268 int c = string.charAt(i);
269 if (c <= 0x007F) {
270 utf8Length++;
271 } else if (c > 0x07FF) {
272 utf8Length += 3;
273 } else {
274 utf8Length += 2;
275 }
276 }
277 return utf8Length;
278 }
279
280 private static void writeChars(DataOutput out,
281 String s, int start, int length)
282 throws IOException {
283 final int end = start + length;
284 for (int i = start; i < end; i++) {
285 int code = s.charAt(i);
286 if (code <= 0x7F) {
287 out.writeByte((byte)code);
288 } else if (code <= 0x07FF) {
289 out.writeByte((byte)(0xC0 | ((code >> 6) & 0x1F)));
290 out.writeByte((byte)(0x80 | code & 0x3F));
291 } else {
292 out.writeByte((byte)(0xE0 | ((code >> 12) & 0X0F)));
293 out.writeByte((byte)(0x80 | ((code >> 6) & 0x3F)));
294 out.writeByte((byte)(0x80 | (code & 0x3F)));
295 }
296 }
297 }
298
299 }