PercentCodec.java

  1. /*
  2.  * Licensed to the Apache Software Foundation (ASF) under one or more
  3.  * contributor license agreements.  See the NOTICE file distributed with
  4.  * this work for additional information regarding copyright ownership.
  5.  * The ASF licenses this file to You under the Apache License, Version 2.0
  6.  * (the "License"); you may not use this file except in compliance with
  7.  * the License.  You may obtain a copy of the License at
  8.  *
  9.  *      https://www.apache.org/licenses/LICENSE-2.0
  10.  *
  11.  * Unless required by applicable law or agreed to in writing, software
  12.  * distributed under the License is distributed on an "AS IS" BASIS,
  13.  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  14.  * See the License for the specific language governing permissions and
  15.  * limitations under the License.
  16.  */

  17. package org.apache.commons.codec.net;

  18. import java.nio.ByteBuffer;
  19. import java.util.BitSet;

  20. import org.apache.commons.codec.BinaryDecoder;
  21. import org.apache.commons.codec.BinaryEncoder;
  22. import org.apache.commons.codec.DecoderException;
  23. import org.apache.commons.codec.EncoderException;

  24. /**
  25.  * Implements the Percent-Encoding scheme, as described in HTTP 1.1 specification. For extensibility, an array of
  26.  * special US-ASCII characters can be specified in order to perform proper URI encoding for the different parts
  27.  * of the URI.
  28.  * <p>
  29.  * This class is immutable. It is also thread-safe besides using BitSet which is not thread-safe, but its public
  30.  * interface only call the access
  31.  * </p>
  32.  *
  33.  * @see <a href="https://tools.ietf.org/html/rfc3986#section-2.1">Percent-Encoding</a>
  34.  * @since 1.12
  35.  */
  36. public class PercentCodec implements BinaryEncoder, BinaryDecoder {

  37.     /**
  38.      * The escape character used by the Percent-Encoding in order to introduce an encoded character.
  39.      */
  40.     private static final byte ESCAPE_CHAR = '%';

  41.     /**
  42.      * The bit set used to store the character that should be always encoded
  43.      */
  44.     private final BitSet alwaysEncodeChars = new BitSet();

  45.     /**
  46.      * The flag defining if the space character should be encoded as '+'
  47.      */
  48.     private final boolean plusForSpace;

  49.     /**
  50.      * The minimum and maximum code of the bytes that is inserted in the bit set, used to prevent look-ups
  51.      */
  52.     private int alwaysEncodeCharsMin = Integer.MAX_VALUE, alwaysEncodeCharsMax = Integer.MIN_VALUE;

  53.     /**
  54.      * Constructs a Percent coded that will encode all the non US-ASCII characters using the Percent-Encoding
  55.      * while it will not encode all the US-ASCII characters, except for character '%' that is used as escape
  56.      * character for Percent-Encoding.
  57.      */
  58.     public PercentCodec() {
  59.         this.plusForSpace = false;
  60.         insertAlwaysEncodeChar(ESCAPE_CHAR);
  61.     }

  62.     /**
  63.      * Constructs a Percent codec by specifying the characters that belong to US-ASCII that should
  64.      * always be encoded. The rest US-ASCII characters will not be encoded, except for character '%' that
  65.      * is used as escape character for Percent-Encoding.
  66.      *
  67.      * @param alwaysEncodeChars the unsafe characters that should always be encoded
  68.      * @param plusForSpace      the flag defining if the space character should be encoded as '+'
  69.      */
  70.     public PercentCodec(final byte[] alwaysEncodeChars, final boolean plusForSpace) {
  71.         this.plusForSpace = plusForSpace;
  72.         insertAlwaysEncodeChars(alwaysEncodeChars);
  73.     }

  74.     private boolean canEncode(final byte c) {
  75.         return !isAsciiChar(c) || inAlwaysEncodeCharsRange(c) && alwaysEncodeChars.get(c);
  76.     }

  77.     private boolean containsSpace(final byte[] bytes) {
  78.         for (final byte b : bytes) {
  79.             if (b == ' ') {
  80.                 return true;
  81.             }
  82.         }
  83.         return false;
  84.     }

  85.     /**
  86.      * Decodes bytes encoded with Percent-Encoding based on RFC 3986. The reverse process is performed in order to
  87.      * decode the encoded characters to Unicode.
  88.      */
  89.     @Override
  90.     public byte[] decode(final byte[] bytes) throws DecoderException {
  91.         if (bytes == null) {
  92.             return null;
  93.         }
  94.         final ByteBuffer buffer = ByteBuffer.allocate(expectedDecodingBytes(bytes));
  95.         for (int i = 0; i < bytes.length; i++) {
  96.             final byte b = bytes[i];
  97.             if (b == ESCAPE_CHAR) {
  98.                 try {
  99.                     final int u = Utils.digit16(bytes[++i]);
  100.                     final int l = Utils.digit16(bytes[++i]);
  101.                     buffer.put((byte) ((u << 4) + l));
  102.                 } catch (final ArrayIndexOutOfBoundsException e) {
  103.                     throw new DecoderException("Invalid percent decoding: ", e);
  104.                 }
  105.             } else if (plusForSpace && b == '+') {
  106.                 buffer.put((byte) ' ');
  107.             } else {
  108.                 buffer.put(b);
  109.             }
  110.         }
  111.         return buffer.array();
  112.     }

  113.     /**
  114.      * Decodes a byte[] Object, whose bytes are encoded with Percent-Encoding.
  115.      *
  116.      * @param obj the object to decode
  117.      * @return the decoding result byte[] as Object
  118.      * @throws DecoderException if the object is not a byte array
  119.      */
  120.     @Override
  121.     public Object decode(final Object obj) throws DecoderException {
  122.         if (obj == null) {
  123.             return null;
  124.         }
  125.         if (obj instanceof byte[]) {
  126.             return decode((byte[]) obj);
  127.         }
  128.         throw new DecoderException("Objects of type " + obj.getClass().getName() + " cannot be Percent decoded");
  129.     }

  130.     private byte[] doEncode(final byte[] bytes, final int expectedLength, final boolean willEncode) {
  131.         final ByteBuffer buffer = ByteBuffer.allocate(expectedLength);
  132.         for (final byte b : bytes) {
  133.             if (willEncode && canEncode(b)) {
  134.                 byte bb = b;
  135.                 if (bb < 0) {
  136.                     bb = (byte) (256 + bb);
  137.                 }
  138.                 final char hex1 = Utils.hexDigit(bb >> 4);
  139.                 final char hex2 = Utils.hexDigit(bb);
  140.                 buffer.put(ESCAPE_CHAR);
  141.                 buffer.put((byte) hex1);
  142.                 buffer.put((byte) hex2);
  143.             } else if (plusForSpace && b == ' ') {
  144.                 buffer.put((byte) '+');
  145.             } else {
  146.                 buffer.put(b);
  147.             }
  148.         }
  149.         return buffer.array();
  150.     }

  151.     /**
  152.      * Percent-Encoding based on RFC 3986. The non US-ASCII characters are encoded, as well as the
  153.      * US-ASCII characters that are configured to be always encoded.
  154.      */
  155.     @Override
  156.     public byte[] encode(final byte[] bytes) throws EncoderException {
  157.         if (bytes == null) {
  158.             return null;
  159.         }
  160.         final int expectedEncodingBytes = expectedEncodingBytes(bytes);
  161.         final boolean willEncode = expectedEncodingBytes != bytes.length;
  162.         if (willEncode || plusForSpace && containsSpace(bytes)) {
  163.             return doEncode(bytes, expectedEncodingBytes, willEncode);
  164.         }
  165.         return bytes;
  166.     }

  167.     /**
  168.      * Encodes an object into using the Percent-Encoding. Only byte[] objects are accepted.
  169.      *
  170.      * @param obj the object to encode
  171.      * @return the encoding result byte[] as Object
  172.      * @throws EncoderException if the object is not a byte array
  173.      */
  174.     @Override
  175.     public Object encode(final Object obj) throws EncoderException {
  176.         if (obj == null) {
  177.             return null;
  178.         }
  179.         if (obj instanceof byte[]) {
  180.             return encode((byte[]) obj);
  181.         }
  182.         throw new EncoderException("Objects of type " + obj.getClass().getName() + " cannot be Percent encoded");
  183.     }

  184.     private int expectedDecodingBytes(final byte[] bytes) {
  185.         int byteCount = 0;
  186.         for (int i = 0; i < bytes.length;) {
  187.             final byte b = bytes[i];
  188.             i += b == ESCAPE_CHAR ? 3 : 1;
  189.             byteCount++;
  190.         }
  191.         return byteCount;
  192.     }

  193.     private int expectedEncodingBytes(final byte[] bytes) {
  194.         int byteCount = 0;
  195.         for (final byte b : bytes) {
  196.             byteCount += canEncode(b) ? 3 : 1;
  197.         }
  198.         return byteCount;
  199.     }

  200.     private boolean inAlwaysEncodeCharsRange(final byte c) {
  201.         return c >= alwaysEncodeCharsMin && c <= alwaysEncodeCharsMax;
  202.     }

  203.     /**
  204.      * Inserts a single character into a BitSet and maintains the min and max of the characters of the
  205.      * {@code BitSet alwaysEncodeChars} in order to avoid look-ups when a byte is out of this range.
  206.      *
  207.      * @param b the byte that is candidate for min and max limit
  208.      */
  209.     private void insertAlwaysEncodeChar(final byte b) {
  210.         if (b < 0) {
  211.             throw new IllegalArgumentException("byte must be >= 0");
  212.         }
  213.         this.alwaysEncodeChars.set(b);
  214.         if (b < alwaysEncodeCharsMin) {
  215.             alwaysEncodeCharsMin = b;
  216.         }
  217.         if (b > alwaysEncodeCharsMax) {
  218.             alwaysEncodeCharsMax = b;
  219.         }
  220.     }

  221.     /**
  222.      * Inserts the byte array into a BitSet for faster lookup.
  223.      *
  224.      * @param alwaysEncodeCharsArray
  225.      */
  226.     private void insertAlwaysEncodeChars(final byte[] alwaysEncodeCharsArray) {
  227.         if (alwaysEncodeCharsArray != null) {
  228.             for (final byte b : alwaysEncodeCharsArray) {
  229.                 insertAlwaysEncodeChar(b);
  230.             }
  231.         }
  232.         insertAlwaysEncodeChar(ESCAPE_CHAR);
  233.     }

  234.     private boolean isAsciiChar(final byte c) {
  235.         return c >= 0;
  236.     }
  237. }