skip to Main Content

I want to generate text bytes in various charsets (encodings), such as ISO-8859-1, Big5, UTF-8, UTF-16, etc., mostly for testing purposes (i.e. to make sure my app script can correctly handle the provided bytes in these charsets), which is almost like:

new TextEncoder(mycharset).encode(mystring)

Unfortunately TextEncoder only supports conversion of a string into UTF-8 bytes, unlike TextDecoder, which supports conversion of bytes in various charsets into a string.

The behavior of TextEncoder is defined by the spec and is unlikely to change in the future… Does anyone know why there is this inconsistent behavior between the two classes? And is there a way to do the job? (without manually providing a conversion table/map for all the target charsets)

TO REVIEWER: This issue is NOT a duplicate of the related question, which asks why the parameter of TextEncoder does not take effect, and the answer says that TextEncoder does not take a parameter by the spec.

This questions specifically asks for:

  1. why the spec specified that TextEncoder not take a parameter, which is inconsistent with TextDecoder
  2. HOW to perform the conversion as this question asked

Both are not asked and explained in the related question and answer.

2

Answers


  1. Chosen as BEST ANSWER

    Thanks for @Kaiido's idea. After some tests I finally find a way to do it natively:

    /**
     * Encode a string into bytes in the specified charset.
     *
     * @param {string} str - the string to encode
     * @param {string} [charset=UTF-8] - the target charset to encode into
     * @param {*} [replacement] - the replacement char for a non-encodable char,
     *     which should be a valid ASCII char. Empty string to replace with
     *     nothing. Falsy to throw an error instead.
     * @return {Promise<Uint8Array>} The encoded bytes.
     */
    var encodeText = (() => {
      // Serialize every char as a decimal HTML entity ("&#NNN;") so the string
      // survives being embedded in an HTML attribute regardless of charset.
      function escapeHtml(str) {
        const rv = [];
        for (let i = 0, I = str.length; i < I; i++) {
          const code = str.codePointAt(i);
          if (code > 0xFFFF) { i++; } // skip the low surrogate of an astral pair
          rv.push(`&#${code};`);
        }
        return rv.join('');
      }

      // Decode the percent-encoded byte string posted back by the iframe.
      // Hex entities ("&#xNN;") are the special chars we escaped ourselves and
      // are restored verbatim; decimal entities ("&#NNN;") are chars the
      // browser could not encode in the target charset.
      function unescapeHtml(str, replacement) {
        // FIX: the character classes must be \d — a bare "d" only matches the
        // literal letter "d", so no entity was ever recognized.
        return unescape(str).replace(/&#(?:(\d+)|x([\dA-Fa-f]+));/g, (_, dec, hex) => {
          if (hex) {
            return String.fromCharCode(parseInt(hex, 16));
          }
          if (typeof replacement === 'string') {
            return replacement;
          }
          // Throw the offending code point (a number); encodeText catches it
          // and converts it into a descriptive RangeError.
          throw parseInt(dec, 10);
        });
      }

      // Pack a byte string (each charCode <= 0xFF) into a Uint8Array.
      function byteStringToU8Array(bstr) {
        let n = bstr.length, u8ar = new Uint8Array(n);
        while (n--) { u8ar[n] = bstr.charCodeAt(n); }
        return u8ar;
      }

      async function encodeText(str, charset = "UTF-8", replacement = null) {
        // test if the charset is available
        try {
          new TextDecoder(charset);
        } catch (ex) {
          throw new RangeError(`Specified charset "${charset}" is not supported.`);
        }

        charset = charset.toLowerCase();

        // specially handle Unicode transformations natively
        // Available UTF names:
        // https://developer.mozilla.org/en-US/docs/Web/API/Encoding_API/Encodings
        if (['utf-8', 'utf8', 'unicode-1-1-utf-8'].includes(charset)) {
          return new TextEncoder().encode(str);
        } else if (['utf-16be', 'utf-16le', 'utf-16'].includes(charset)) {
          // Plain "utf-16" is treated as little-endian, like TextDecoder does.
          const littleEndian = charset !== 'utf-16be';
          const u8ar = new Uint8Array(str.length * 2);
          const view = new DataView(u8ar.buffer);
          for (let i = 0, I = str.length; i < I; i++) {
            view.setUint16(i * 2, str.charCodeAt(i), littleEndian);
          }
          return u8ar;
        }

        // Legacy charsets: load a Blob-backed iframe whose document is declared
        // with the target charset; inside it, assigning the text to a URL query
        // percent-encodes it in the document charset (non-encodable chars
        // presumably become decimal "&#NNN;" references — confirmed by the
        // unescapeHtml contract above), and the result is posted back.
        const frame = document.createElement("iframe");
        const markup = `<!DOCTYPE html><script data-text="${escapeHtml(str)}">
    function escapeHtml(str) {
      return str.replace(/[&#%]/g, m => escape("&#x" + m.charCodeAt(0).toString(16) + ";"));
    }
    const text = escapeHtml(document.currentScript.dataset.text);
    const a = document.createElement("a");
    a.href = "https://example.com/?" + text;
    parent.postMessage(a.search.slice(1), "*");
    </script>`;
        const blob = new Blob([markup], {type: `text/html;charset=${charset}`});
        frame.src = URL.createObjectURL(blob);
        document.body.append(frame);
        const aborter = new AbortController();
        let result = await new Promise((resolve) => {
          addEventListener("message", ({source, data}) => {
            if (source === frame.contentWindow) {
              aborter.abort();
              resolve(data);
            }
          }, {signal: aborter.signal});
        });
        URL.revokeObjectURL(frame.src); // FIX: release the blob URL (was leaked)
        frame.remove();
        try {
          result = unescapeHtml(result, replacement);
        } catch (code) {
          const _code = code.toString(16).toUpperCase();
          const idx = str.indexOf(String.fromCodePoint(code));
          throw new RangeError(`Unable to encode char U+${_code} at position ${idx}`);
        }
        return byteStringToU8Array(result);
      }

      return encodeText;
    })();
    
    // tests
    (async () => {
      const sample = "中文𠀀";

      // Big5, substituting "?" for non-encodable chars.
      console.log("big5", await encodeText(sample, "big5", "?"));

      // Shift_JIS, silently dropping non-encodable chars.
      console.log("shift_jis", await encodeText(sample, "shift_jis", ""));

      // Unicode transformations are handled natively.
      console.log("utf-8", await encodeText(sample, "utf-8"));
      console.log("utf-16be", await encodeText(sample, "utf-16be"));
      console.log("utf-16le", await encodeText(sample, "utf-16le"));

      // check special chars are passed safely
      console.log("big5", await encodeText("&#123;<>%20", "big5"));

      // throw an error for a bad charset
      try {
        await encodeText(sample, "wtf");
      } catch (ex) {
        console.error(ex);
      }

      // throw an error if no replacement string
      try {
        await encodeText(sample, "big5");
      } catch (ex) {
        console.error(ex);
      }
    })();


  2. According to this issue the logic for keeping only UTF-8 is that all the various Web APIs only accept UTF-8 as input. So there should be no need to produce other encodings for the Web APIs. This makes the API a lot simpler. Also, there is a goal to make everything UTF-8, so not providing tools to produce non-UTF-8 encodings make sense for the Web API. However it’s clear this goal isn’t reached yet and thus it makes sense to have a decoder in the platform.

    As for how to perform such encodings, you can use the polyfill made by Joshua Bell, which does have a NONSTANDARD_allowLegacyEncoding option.
    To use the polyfill on modern browsers, you’d need to nullify the browser’s TextEncoder though:

    <script>
    window.TextEncoder = window.TextDecoder = null;
    </script>
    <script src="PATH/TO/POLYFILL.js"></script>
    

    Here I copy a snippet from a previous answer of mine:

    (async () => {
      // Demo: encode text into a legacy charset via the text-encoding polyfill.
      // FIX: the sample text lost its "\n" escape when the snippet was copied
      // ("endingsnand"); `endings: "native"` below only has an effect when the
      // text actually contains a newline to normalize.
      const text = `Some text with nice line endings\nand special characters like é and ü.`;
      const encoding = 'windows-1252'; // a.k.a ANSI

      // Normalize line endings to the platform's native form first.
      const utf8_blob = new Blob([text], { endings: "native" });
      const utf_8_txt = await utf8_blob.text();

      // Requires the polyfill: the native TextEncoder ignores both arguments.
      const encoder = new TextEncoder(encoding, {
        NONSTANDARD_allowLegacyEncoding: true
      });
      const data = encoder.encode(utf_8_txt); // now `data` is an Uint8Array
      const encoded_as_ANSI = new Blob([data]);

      // Reading back with the matching charset restores the original text;
      // reading as UTF-8 shows the bytes really are not UTF-8.
      const read_as_ANSI = await readAsText(encoded_as_ANSI, encoding)
      console.log({ read_as_ANSI });
      const read_as_UTF8 = await encoded_as_ANSI.text();
      console.log({ read_as_UTF8 });
    })();
    
    /**
     * Read a Blob as text in the given charset via FileReader.
     *
     * @param {Blob} blob - the blob to read
     * @param {string} encoding - the charset label to decode with
     * @return {Promise<string>} The decoded text.
     */
    function readAsText(blob, encoding) {
      return new Promise((res, rej) => {
        const reader = new FileReader();
        reader.onload = () => res(reader.result);
        // FIX: without an error handler the promise never settled on a
        // failed read; reject with the reader's DOMException instead.
        reader.onerror = () => rej(reader.error);
        reader.readAsText(blob, encoding);
      });
    }
    <script>window.TextEncoder = null;// force installation of the polyfill</script>
    <script src="https://cdn.jsdelivr.net/gh/inexorabletash/text-encoding/lib/encoding-indexes.js"></script>
    <script src="https://cdn.jsdelivr.net/gh/inexorabletash/text-encoding/lib/encoding.js"></script>
    Login or Signup to reply.
Please signup or login to give your own answer.
Back To Top
Search