001    /*
002     * Copyright 2010-2015 JetBrains s.r.o.
003     *
004     * Licensed under the Apache License, Version 2.0 (the "License");
005     * you may not use this file except in compliance with the License.
006     * You may obtain a copy of the License at
007     *
008     * http://www.apache.org/licenses/LICENSE-2.0
009     *
010     * Unless required by applicable law or agreed to in writing, software
011     * distributed under the License is distributed on an "AS IS" BASIS,
012     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
013     * See the License for the specific language governing permissions and
014     * limitations under the License.
015     */
016    
017    package org.jetbrains.kotlin.serialization.jvm;
018    
019    import org.jetbrains.annotations.NotNull;
020    
021    import java.util.ArrayList;
022    import java.util.List;
023    
024    public class BitEncoding {
025        private BitEncoding() {
026        }
027    
028        /**
029         * Converts a byte array of serialized data to an array of {@code String} satisfying JVM annotation value argument restrictions:
030         * <ol>
031         *     <li>Each string's length should be no more than 65535</li>
032         *     <li>UTF-8 representation of each string cannot contain bytes in the range 0xf0..0xff</li>
033         * </ol>
034         */
035        @NotNull
036        public static String[] encodeBytes(@NotNull byte[] data) {
037            byte[] bytes = encode8to7(data);
038            // Since 0x0 byte is encoded as two bytes in the Modified UTF-8 (0xc0 0x80) and zero is rather common to byte arrays, we increment
039            // every byte by one modulo max byte value, so that the less common value 0x7f will be represented as two bytes instead.
040            addModuloByte(bytes, 1);
041            return splitBytesToStringArray(bytes);
042        }
043    
044        /**
045         * Converts a byte array to another byte array, every element of which is in the range 0x0..0x7f.
046         *
047         * The conversion is equivalent to the following: input bytes are combined into one long bit string. This big string is then split into
048         * groups of 7 bits. Each resulting 7-bit chunk is then converted to a byte (with a leading bit = 0). The last chunk may have less than
049         * 7 bits, it's prepended with zeros to form a byte. The result is then the array of these bytes, each of which is obviously in the
050         * range 0x0..0x7f.
051         *
052         * Suppose the input of 4 bytes is given (bytes are listed from the beginning to the end, each byte from the least significant bit to
053         * the most significant bit, bits within each byte are numbered):
054         *
055         *     01234567 01234567 01234567 01234567
056         *
057         * The output for this kind of input will be of the following form ('#' represents a zero bit):
058         *
059         *     0123456# 7012345# 6701234# 5670123# 4567####
060         */
061        @NotNull
062        private static byte[] encode8to7(@NotNull byte[] data) {
063            // ceil(data.length * 8 / 7)
064            int resultLength = (data.length * 8 + 6) / 7;
065            byte[] result = new byte[resultLength];
066    
067            // We maintain a pointer to the bit in the input, which is represented by two numbers: index of the current byte in the input and
068            // the index of a bit inside this byte (0 is least significant, 7 is most significant)
069            int byteIndex = 0;
070            int bit = 0;
071    
072            // Write all resulting bytes except the last one. To do this we need to collect exactly 7 bits, starting from the current, into a
073            // byte. In almost all cases these 7 bits can be collected from two parts: the first is several (at least one) most significant bits
074            // from the current byte, the second is several (maybe zero) least significant bits from the next byte. The special case is when the
075            // current bit is the first (least significant) bit in its byte (bit == 0): then the 7 needed bits are just the 7 least significant
076            // of the current byte.
077            for (int i = 0; i < resultLength - 1; i++) {
078                if (bit == 0) {
079                    result[i] = (byte) (data[byteIndex] & 0x7f);
080                    bit = 7;
081                    continue;
082                }
083    
084                int firstPart = (data[byteIndex] & 0xff) >>> bit;
085                int newBit = (bit + 7) & 7;
086                int secondPart = (data[++byteIndex] & ((1 << newBit) - 1)) << 8 - bit;
087                result[i] = (byte) (firstPart + secondPart);
088                bit = newBit;
089            }
090    
091            // Write the last byte, which is just several most significant bits of the last byte in the input, padded with zeros
092            if (resultLength > 0) {
093                assert bit != 0 : "The last chunk cannot start from the input byte since otherwise at least one bit will remain unprocessed";
094                assert byteIndex == data.length - 1 : "The last 7-bit chunk should be encoded from the last input byte: " +
095                                                      byteIndex + " != " + (data.length - 1);
096                result[resultLength - 1] = (byte) ((data[byteIndex] & 0xff) >>> bit);
097            }
098    
099            return result;
100        }
101    
102        private static void addModuloByte(@NotNull byte[] data, int increment) {
103            for (int i = 0, n = data.length; i < n; i++) {
104                data[i] = (byte) ((data[i] + increment) & 0x7f);
105            }
106        }
107    
108        // The maximum possible length of the byte array in the CONSTANT_Utf8_info structure in the bytecode, as per JVMS7 4.4.7
109        private static final int MAX_UTF8_INFO_LENGTH = 65535;
110    
111        /**
112         * Converts a big byte array into the array of strings, where each string, when written to the constant pool table in bytecode, produces
113         * a byte array of not more than MAX_UTF8_INFO_LENGTH. Each byte, except those which are 0x0, occupies exactly one byte in the constant
114         * pool table. Zero bytes occupy two bytes in the table each.
115         *
116         * When strings are constructed from the array of bytes here, they are encoded in the platform's default encoding. This is fine: the
117         * conversion to the Modified UTF-8 (which here would be equivalent to replacing each 0x0 with 0xc0 0x80) will happen later by ASM, when
118         * it writes these strings to the bytecode
119         */
120        @NotNull
121        private static String[] splitBytesToStringArray(@NotNull byte[] data) {
122            List<String> result = new ArrayList<String>();
123    
124            // The offset where the currently processed string starts
125            int off = 0;
126    
127            // The effective length the bytes of the current string would occupy in the constant pool table
128            int len = 0;
129    
130            for (int i = 0, n = data.length; i < n; i++) {
131                // When the effective length reaches at least MAX - 1, we add the current string to the result. Note that the effective length
132                // is at most MAX here: non-zero bytes occupy 1 byte and zero bytes occupy 2 bytes, so we couldn't jump over more than one byte
133                if (len >= MAX_UTF8_INFO_LENGTH - 1) {
134                    assert len <= MAX_UTF8_INFO_LENGTH : "Produced strings cannot contain more than " + MAX_UTF8_INFO_LENGTH + " bytes: " + len;
135                    result.add(new String(data, off, i - off));
136                    off = i;
137                    len = 0;
138                }
139    
140                if (data[i] == 0) {
141                    len += 2;
142                }
143                else {
144                    len++;
145                }
146            }
147    
148            if (len >= 0) {
149                result.add(new String(data, off, data.length - off));
150            }
151    
152            return result.toArray(new String[result.size()]);
153        }
154    
155        /**
156         * Converts encoded array of {@code String} obtained by {@link BitEncoding#encodeBytes(byte[])} back to a byte array.
157         */
158        @NotNull
159        public static byte[] decodeBytes(@NotNull String[] data) {
160            byte[] bytes = combineStringArrayIntoBytes(data);
161            // Adding 0x7f modulo max byte value is equivalent to subtracting 1 the same modulo, which is inverse to what happens in encodeBytes
162            addModuloByte(bytes, 0x7f);
163            return decode7to8(bytes);
164        }
165    
166        /**
167         * Combines the array of strings resulted from encodeBytes() into one long byte array
168         */
169        @NotNull
170        private static byte[] combineStringArrayIntoBytes(@NotNull String[] data) {
171            int resultLength = 0;
172            for (String s : data) {
173                assert s.length() <= MAX_UTF8_INFO_LENGTH : "Too long string: " + s.length();
174                resultLength += s.length();
175            }
176    
177            byte[] result = new byte[resultLength];
178            int p = 0;
179            for (String s : data) {
180                for (int i = 0, n = s.length(); i < n; i++) {
181                    result[p++] = (byte) s.charAt(i);
182                }
183            }
184    
185            return result;
186        }
187    
188        /**
189         * Decodes the byte array resulted from encode8to7().
190         *
191         * Each byte of the input array has at most 7 valuable bits of information. So the decoding is equivalent to the following: least
192         * significant 7 bits of all input bytes are combined into one long bit string. This bit string is then split into groups of 8 bits,
193         * each of which forms a byte in the output. If there are any leftovers, they are ignored, since they were added just as a padding and
194         * do not comprise a full byte.
195         *
196         * Suppose the following encoded byte array is given (bits are numbered the same way as in encode8to7() doc):
197         *
198         *     01234567 01234567 01234567 01234567
199         *
200         * The output of the following form would be produced:
201         *
202         *     01234560 12345601 23456012
203         *
204         * Note how all most significant bits and leftovers are dropped, since they don't contain any useful information
205         */
206        @NotNull
207        private static byte[] decode7to8(@NotNull byte[] data) {
208            // floor(7 * data.length / 8)
209            int resultLength = 7 * data.length / 8;
210    
211            byte[] result = new byte[resultLength];
212    
213            // We maintain a pointer to an input bit in the same fashion as in encode8to7(): it's represented as two numbers: index of the
214            // current byte in the input and index of the bit in the byte
215            int byteIndex = 0;
216            int bit = 0;
217    
218            // A resulting byte is comprised of 8 bits, starting from the current bit. Since each input byte only "contains 7 bytes", a
219            // resulting byte always consists of two parts: several most significant bits of the current byte and several least significant bits
220            // of the next byte
221            for (int i = 0; i < resultLength; i++) {
222                int firstPart = (data[byteIndex] & 0xff) >>> bit;
223                byteIndex++;
224                int secondPart = (data[byteIndex] & ((1 << (bit + 1)) - 1)) << 7 - bit;
225                result[i] = (byte) (firstPart + secondPart);
226    
227                if (bit == 6) {
228                    byteIndex++;
229                    bit = 0;
230                }
231                else {
232                    bit++;
233                }
234            }
235    
236            return result;
237        }
238    }