/*
 * Copyright 2010-2015 JetBrains s.r.o.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.jetbrains.kotlin.serialization.jvm;

import org.jetbrains.annotations.NotNull;

import java.util.ArrayList;
import java.util.List;

/**
 * Converts arbitrary byte arrays to arrays of {@code String} that can be stored as JVM annotation arguments, and back.
 *
 * The class only has static methods and cannot be instantiated.
 */
public final class BitEncoding {
    // The maximum possible length of the byte array in the CONSTANT_Utf8_info structure in the bytecode, as per JVMS7 4.4.7
    private static final int MAX_UTF8_INFO_LENGTH = 65535;

    private BitEncoding() {
    }

    /**
     * Converts a byte array of serialized data to an array of {@code String} satisfying JVM annotation value argument restrictions:
     * <ol>
     *     <li>Each string's length should be no more than 65535</li>
     *     <li>UTF-8 representation of each string cannot contain bytes in the range 0xf0..0xff</li>
     * </ol>
     *
     * @param data the bytes to encode; the array is not modified
     * @return the encoded strings; always contains at least one element (a single empty string for empty input)
     * @throws IllegalArgumentException if the input is so large that its 7-bit representation does not fit into one array
     */
    @NotNull
    public static String[] encodeBytes(@NotNull byte[] data) {
        byte[] bytes = encode8to7(data);
        // Since 0x0 byte is encoded as two bytes in the Modified UTF-8 (0xc0 0x80) and zero is rather common to byte arrays, we increment
        // every byte by one modulo max byte value, so that the less common value 0x7f will be represented as two bytes instead.
        addModuloByte(bytes, 1);
        return splitBytesToStringArray(bytes);
    }

    /**
     * Converts a byte array to another byte array, every element of which is in the range 0x0..0x7f.
     *
     * The conversion is equivalent to the following: input bytes are combined into one long bit string. This bit string is then split
     * into groups of 7 bits. Each resulting 7-bit chunk is then converted to a byte (with a leading bit = 0). The last chunk may have
     * fewer than 7 bits, it's prepended with zeros to form a byte. The result is then the array of these bytes, each of which is
     * obviously in the range 0x0..0x7f.
     *
     * Suppose the input of 4 bytes is given (bytes are listed from the beginning to the end, each byte from the least significant bit to
     * the most significant bit, bits within each byte are numbered):
     *
     *     01234567 01234567 01234567 01234567
     *
     * The output for this kind of input will be of the following form ('#' represents a zero bit):
     *
     *     0123456# 7012345# 6701234# 5670123# 4567####
     */
    @NotNull
    private static byte[] encode8to7(@NotNull byte[] data) {
        // ceil(data.length * 8 / 7), computed in long so that the multiplication cannot silently overflow for very large inputs
        long encodedLength = (data.length * 8L + 6) / 7;
        if (encodedLength > Integer.MAX_VALUE) {
            throw new IllegalArgumentException("Data is too long to be encoded: " + data.length + " bytes");
        }
        int resultLength = (int) encodedLength;
        byte[] result = new byte[resultLength];

        // We maintain a pointer to the bit in the input, which is represented by two numbers: index of the current byte in the input and
        // the index of a bit inside this byte (0 is least significant, 7 is most significant)
        int byteIndex = 0;
        int bit = 0;

        // Write all resulting bytes except the last one. To do this we need to collect exactly 7 bits, starting from the current, into a
        // byte. In almost all cases these 7 bits can be collected from two parts: the first is several (at least one) most significant bits
        // from the current byte, the second is several (maybe zero) least significant bits from the next byte. The special case is when the
        // current bit is the first (least significant) bit in its byte (bit == 0): then the 7 needed bits are just the 7 least significant
        // of the current byte.
        for (int i = 0; i < resultLength - 1; i++) {
            if (bit == 0) {
                result[i] = (byte) (data[byteIndex] & 0x7f);
                bit = 7;
                continue;
            }

            // The (8 - bit) most significant bits of the current byte become the low bits of the chunk
            int firstPart = (data[byteIndex] & 0xff) >>> bit;
            // Equivalent to (bit - 1) here, since bit is in 1..7: this many low bits are taken from the next byte
            int newBit = (bit + 7) & 7;
            int secondPart = (data[++byteIndex] & ((1 << newBit) - 1)) << (8 - bit);
            result[i] = (byte) (firstPart + secondPart);
            bit = newBit;
        }

        // Write the last byte, which is just several most significant bits of the last byte in the input, padded with zeros
        if (resultLength > 0) {
            assert bit != 0 : "The last chunk cannot start from the input byte since otherwise at least one bit will remain unprocessed";
            assert byteIndex == data.length - 1 : "The last 7-bit chunk should be encoded from the last input byte: " +
                                                  byteIndex + " != " + (data.length - 1);
            result[resultLength - 1] = (byte) ((data[byteIndex] & 0xff) >>> bit);
        }

        return result;
    }

    /**
     * Adds {@code increment} to every element of {@code data} in place, modulo 0x80, so each element ends up in the range 0x0..0x7f.
     */
    private static void addModuloByte(@NotNull byte[] data, int increment) {
        for (int i = 0, n = data.length; i < n; i++) {
            data[i] = (byte) ((data[i] + increment) & 0x7f);
        }
    }

    /**
     * Converts a big byte array into the array of strings, where each string, when written to the constant pool table in bytecode, produces
     * a byte array of not more than MAX_UTF8_INFO_LENGTH. Each byte, except those which are 0x0, occupies exactly one byte in the constant
     * pool table. Zero bytes occupy two bytes in the table each.
     *
     * Every input byte becomes the char with the same numeric value, independently of the platform's default charset, so that
     * combineStringArrayIntoBytes() restores exactly the same bytes on any JVM. The conversion to the Modified UTF-8 (which here would be
     * equivalent to replacing each 0x0 with 0xc0 0x80) will happen later by ASM, when it writes these strings to the bytecode
     */
    @NotNull
    private static String[] splitBytesToStringArray(@NotNull byte[] data) {
        List<String> result = new ArrayList<String>();

        // The offset where the currently processed string starts
        int off = 0;

        // The effective length the bytes of the current string would occupy in the constant pool table
        int len = 0;

        for (int i = 0, n = data.length; i < n; i++) {
            // When the effective length reaches at least MAX - 1, we add the current string to the result. Note that the effective length
            // is at most MAX here: non-zero bytes occupy 1 byte and zero bytes occupy 2 bytes, so we couldn't jump over more than one byte
            if (len >= MAX_UTF8_INFO_LENGTH - 1) {
                assert len <= MAX_UTF8_INFO_LENGTH : "Produced strings cannot contain more than " + MAX_UTF8_INFO_LENGTH + " bytes: " + len;
                result.add(sevenBitBytesToString(data, off, i - off));
                off = i;
                len = 0;
            }

            len += data[i] == 0 ? 2 : 1;
        }

        // The trailing chunk is always emitted; for empty input this yields a single empty string
        result.add(sevenBitBytesToString(data, off, data.length - off));

        return result.toArray(new String[result.size()]);
    }

    /**
     * Builds a string of {@code length} chars where each char has the value of the low 7 bits of the corresponding byte of
     * {@code data}, starting at {@code offset}. No charset is involved, so the result is the same on every platform.
     */
    @NotNull
    private static String sevenBitBytesToString(@NotNull byte[] data, int offset, int length) {
        char[] chars = new char[length];
        for (int i = 0; i < length; i++) {
            // Callers only pass values in 0x0..0x7f; the mask merely keeps a stray high bit from sign-extending into the char
            chars[i] = (char) (data[offset + i] & 0x7f);
        }
        return new String(chars);
    }

    /**
     * Converts encoded array of {@code String} obtained by {@link BitEncoding#encodeBytes(byte[])} back to a byte array.
     *
     * @param data the strings produced by {@code encodeBytes}
     * @return the decoded bytes
     */
    @NotNull
    public static byte[] decodeBytes(@NotNull String[] data) {
        byte[] bytes = combineStringArrayIntoBytes(data);
        // Adding 0x7f modulo max byte value is equivalent to subtracting 1 the same modulo, which is inverse to what happens in encodeBytes
        addModuloByte(bytes, 0x7f);
        return decode7to8(bytes);
    }

    /**
     * Combines the array of strings resulted from encodeBytes() into one long byte array. Each char contributes one byte, namely its
     * value narrowed to {@code byte}.
     */
    @NotNull
    private static byte[] combineStringArrayIntoBytes(@NotNull String[] data) {
        int resultLength = 0;
        for (String s : data) {
            assert s.length() <= MAX_UTF8_INFO_LENGTH : "Too long string: " + s.length();
            resultLength += s.length();
        }

        byte[] result = new byte[resultLength];
        int p = 0;
        for (String s : data) {
            for (int i = 0, n = s.length(); i < n; i++) {
                result[p++] = (byte) s.charAt(i);
            }
        }

        return result;
    }

    /**
     * Decodes the byte array resulted from encode8to7().
     *
     * Each byte of the input array has at most 7 valuable bits of information. So the decoding is equivalent to the following: least
     * significant 7 bits of all input bytes are combined into one long bit string. This bit string is then split into groups of 8 bits,
     * each of which forms a byte in the output. If there are any leftovers, they are ignored, since they were added just as a padding and
     * do not comprise a full byte.
     *
     * Suppose the following encoded byte array is given (bits are numbered the same way as in encode8to7() doc):
     *
     *     01234567 01234567 01234567 01234567
     *
     * The output of the following form would be produced:
     *
     *     01234560 12345601 23456012
     *
     * Note how all most significant bits and leftovers are dropped, since they don't contain any useful information
     */
    @NotNull
    private static byte[] decode7to8(@NotNull byte[] data) {
        // floor(7 * data.length / 8), computed in long so that the multiplication cannot overflow; the result always fits into int
        int resultLength = (int) (7L * data.length / 8);

        byte[] result = new byte[resultLength];

        // We maintain a pointer to an input bit in the same fashion as in encode8to7(): it's represented as two numbers: index of the
        // current byte in the input and index of the bit in the byte
        int byteIndex = 0;
        int bit = 0;

        // A resulting byte is comprised of 8 bits, starting from the current bit. Since each input byte only carries 7 bits, a
        // resulting byte always consists of two parts: several most significant bits of the current byte and several least significant bits
        // of the next byte
        for (int i = 0; i < resultLength; i++) {
            int firstPart = (data[byteIndex] & 0xff) >>> bit;
            byteIndex++;
            int secondPart = (data[byteIndex] & ((1 << (bit + 1)) - 1)) << (7 - bit);
            result[i] = (byte) (firstPart + secondPart);

            if (bit == 6) {
                // All 7 bits of the second input byte have just been consumed, so the next output byte starts at a fresh input byte
                byteIndex++;
                bit = 0;
            }
            else {
                bit++;
            }
        }

        return result;
    }
}