diff --git a/src/java.base/share/native/libtinyiconv/iconv.cpp b/src/java.base/share/native/libtinyiconv/iconv.cpp new file mode 100644 index 000000000..7018b6ce2 --- /dev/null +++ b/src/java.base/share/native/libtinyiconv/iconv.cpp @@ -0,0 +1,438 @@ +/* + * Copyright (C) 2017 The Android Open Source Project + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS + * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE + * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS + * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED + * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT + * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#ifdef __ANDROID__ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +__BEGIN_DECLS + +/* + * These return values are specified by POSIX for multibyte conversion + * functions. 
+ */
+
+#ifdef __cplusplus
+#define __MB_ERR_ILLEGAL_SEQUENCE static_cast<size_t>(-1)
+#define __MB_ERR_INCOMPLETE_SEQUENCE static_cast<size_t>(-2)
+#else
+#define __MB_ERR_ILLEGAL_SEQUENCE ((size_t)(-1))
+#define __MB_ERR_INCOMPLETE_SEQUENCE ((size_t)(-2))
+#endif // __cplusplus
+
+// True when `rv` equals one of the two error results defined above.
+#define __MB_IS_ERR(rv) ((rv) == __MB_ERR_ILLEGAL_SEQUENCE || \
+                         (rv) == __MB_ERR_INCOMPLETE_SEQUENCE)
+
+// Returns how many bytes of a partial sequence are stored in `ps`: one more
+// than the index of the highest non-zero slot among __seq[0..2], or 0 when all
+// three are zero.
+static inline __wur size_t mbstate_bytes_so_far(const mbstate_t* ps) {
+  return
+      (ps->__seq[2] != 0) ? 3 :
+      (ps->__seq[1] != 0) ? 2 :
+      (ps->__seq[0] != 0) ? 1 : 0;
+}
+
+// Stores `byte` in slot `i` of the partial sequence held in `ps`.
+static inline void mbstate_set_byte(mbstate_t* ps, int i, char byte) {
+  ps->__seq[i] = (uint8_t)(byte);
+}
+
+// Returns the byte stored in slot `n` of the partial sequence held in `ps`.
+static inline __wur uint8_t mbstate_get_byte(const mbstate_t* ps, int n) {
+  return ps->__seq[n];
+}
+
+// Sets errno to `_errno`, clears the first four bytes of ps->__seq and returns
+// __MB_ERR_ILLEGAL_SEQUENCE.
+static inline __wur size_t mbstate_reset_and_return_illegal(int _errno, mbstate_t* ps) {
+  errno = _errno;
+  // memset() clears the same four bytes the previous uint32_t store did, but
+  // without type-punning the byte array (no aliasing or alignment assumptions).
+  memset(ps->__seq, 0, sizeof(uint32_t));
+  return __MB_ERR_ILLEGAL_SEQUENCE;
+}
+
+// Clears the first four bytes of ps->__seq and returns `_return` converted to
+// size_t.
+static inline __wur size_t mbstate_reset_and_return(int _return, mbstate_t* ps) {
+  memset(ps->__seq, 0, sizeof(uint32_t));
+  return _return;
+}
+
+// Sentinel descriptor returned by iconv_open() on failure.
+#ifdef __cplusplus
+# define INVALID_ICONV_T reinterpret_cast<iconv_t>(-1)
+#else // !__cplusplus
+# define INVALID_ICONV_T ((iconv_t)(-1))
+#endif // __cplusplus
+
+// Ideally we'd use icu4c but the API mismatch seems too great. So we just offer something
+// equivalent to (but slightly easier to use for runs of text than) <uchar.h>. If you're
+// here to add more encodings, consider working on finishing the icu4c NDK wrappers instead.
+
+// Encodings this converter can read and write.
+#ifdef __cplusplus
+  enum Encoding
+#else
+  typedef enum
+#endif // __cplusplus
+{
+  US_ASCII,
+  UTF_8,
+  UTF_16_LE,
+  UTF_16_BE,
+  UTF_32_LE,
+  UTF_32_BE,
+  WCHAR_T,
+#ifdef __cplusplus
+  };
+#else
+  } Encoding;
+#endif // __cplusplus
+
+// What to do with a character that cannot be decoded or encoded:
+// ERROR fails the conversion, IGNORE (the "//IGNORE" suffix) skips it, and
+// TRANSLIT (the "//TRANSLIT" suffix) substitutes '?'.
+#ifdef __cplusplus
+  enum Mode
+#else
+  typedef enum
+#endif // __cplusplus
+{
+  ERROR,
+  IGNORE,
+  TRANSLIT,
+#ifdef __cplusplus
+  };
+#else
+  } Mode;
+#endif // __cplusplus
+
+// Compares a user-supplied encoding name (`lhs`) with a canonical lower-case
+// alphanumeric name (`rhs`). Returns true when they match.
+//
+// This matching is strange but true.
+// See http://www.unicode.org/reports/tr22/#Charset_Alias_Matching.
+static bool __match_encoding(const char* lhs, const char* rhs) {
+  // The <ctype.h> functions require a value representable as unsigned char,
+  // so every character is converted before being classified or case-folded.
+  while (*lhs && *rhs) {
+    // Skip non-alnum in lhs; "UTF-8", "UTF_8", "UTF8", "UTF 8" are all equivalent.
+    // Also implement the "delete each 0 that is not preceded by a digit" rule.
+    for (; *lhs; ++lhs) {
+      unsigned char cur = (unsigned char)(*lhs);
+      unsigned char next = (unsigned char)(*(lhs + 1));
+      if (isalnum(cur) && (cur != '0' || !isdigit(next))) break;
+    }
+    // Case doesn't matter either.
+    if (tolower((unsigned char)(*lhs)) != tolower((unsigned char)(*rhs))) break;
+    ++lhs;
+    ++rhs;
+  }
+  // As a special case we treat the GNU "//" extensions as end of string.
+  if ((*lhs == '\0' || strstr(lhs, "//") == lhs) && *rhs == '\0') return true;
+  return false;
+}
+
+// Parses an encoding name, optionally followed by "//IGNORE" or "//TRANSLIT".
+// On success stores the encoding in `*encoding` (and the suffix's mode in
+// `*mode`, which is left untouched when there is no suffix) and returns true.
+// Returns false for an unknown encoding, an unknown suffix, or a suffix given
+// while `mode` is null.
+static bool __parse_encoding(const char* s, Encoding* encoding, Mode* mode) {
+  const char* suffix = strstr(s, "//");
+  if (suffix) {
+    if (!mode) return false;
+    if (strcmp(suffix, "//IGNORE") == 0) {
+      *mode = IGNORE;
+    } else if (strcmp(suffix, "//TRANSLIT") == 0) {
+      *mode = TRANSLIT;
+    } else {
+      return false;
+    }
+  }
+  if (__match_encoding(s, "utf8")) {
+    *encoding = UTF_8;
+  } else if (__match_encoding(s, "ascii") || __match_encoding(s, "usascii")) {
+    *encoding = US_ASCII;
+  } else if (__match_encoding(s, "utf16le")) {
+    *encoding = UTF_16_LE;
+  } else if (__match_encoding(s, "utf16be")) {
+    *encoding = UTF_16_BE;
+  } else if (__match_encoding(s, "utf32le")) {
+    *encoding = UTF_32_LE;
+  } else if (__match_encoding(s, "utf32be")) {
+    *encoding = UTF_32_BE;
+  } else if (__match_encoding(s, "wchart")) {
+    *encoding = WCHAR_T;
+  } else {
+    return false;
+  }
+  return true;
+}
+
+// Conversion descriptor: the encoding pair and error mode chosen by
+// iconv_open(), plus the scratch state for one iconv() call.
+struct __iconv_t {
+  Encoding src_encoding;
+  Encoding dst_encoding;
+  Mode mode;
+
+  // There is no constructor: iconv_open() assigns `mode` itself and releases
+  // the object with free().
+
+  // Converts characters from *src_buf0 to *dst_buf0 until the input is used
+  // up, advancing both pointers and decrementing both byte counts as it goes.
+  // Returns the number of '?' substitutions in TRANSLIT mode, 0 on plain
+  // success, or -1 with errno set (EILSEQ, EINVAL for truncated input, E2BIG
+  // for a full output buffer).
+  int Convert(char** src_buf0, size_t* src_bytes_left0, char** dst_buf0, size_t* dst_bytes_left0) {
+    // Reset state.
+    wc = 0;
+    memset(&ps, 0, sizeof(ps));
+    replacement_count = 0;
+    ignored = false;
+    have_char = false;
+    src_buf = src_buf0;
+    src_bytes_left = src_bytes_left0;
+    dst_buf = dst_buf0;
+    dst_bytes_left = dst_bytes_left0;
+    // POSIX lets callers pass a null input buffer to reset the conversion
+    // state. No state survives between calls here, so that is a no-op.
+    if (src_buf == NULL || *src_buf == NULL || src_bytes_left == NULL) return 0;
+    while (*src_bytes_left > 0) {
+      if (!GetNext()) return -1;
+      if (!have_char) break;  // IGNORE mode discarded the rest of the input.
+      if (!Convert()) return -1;
+    }
+    return Done();
+  }
+
+ private:
+  char32_t wc;               // Code point currently being converted.
+  char buf[16];              // Encoded form of `wc`, filled in by Convert().
+  size_t src_bytes_used;     // Input bytes occupied by `wc`.
+  size_t dst_bytes_used;     // Valid bytes in `buf`.
+  mbstate_t ps;              // State handed to mbrtoc32()/c32rtomb().
+  size_t replacement_count;  // Number of '?' substitutions (TRANSLIT).
+  bool ignored;              // Whether anything was skipped (IGNORE).
+  bool have_char;            // False when GetNext() ran out of input while skipping.
+  char** src_buf;
+  size_t* src_bytes_left;
+  char** dst_buf;
+  size_t* dst_bytes_left;
+
+  // Decodes the next character of the input into `wc` and records its encoded
+  // length in `src_bytes_used`; the input position itself is advanced later,
+  // by Emit(). Undecodable input is handled according to `mode`. Returns false
+  // with errno set on failure. Returns true with `have_char` cleared when
+  // IGNORE mode skipped everything that was left.
+  bool GetNext() {
+    for (;;) {
+      have_char = true;
+      errno = 0;
+      switch (src_encoding) {
+        case US_ASCII:
+          wc = **src_buf;
+          src_bytes_used = 1;
+          if (wc > 0x7f) errno = EILSEQ;
+          break;
+        case UTF_8:
+          src_bytes_used = mbrtoc32(&wc, *src_buf, *src_bytes_left, &ps);
+          if (src_bytes_used == __MB_ERR_ILLEGAL_SEQUENCE) {
+            // The result is (size_t)-1, which must never be used as a length:
+            // skip exactly one offending byte and restart from a clean state.
+            errno = EILSEQ;
+            src_bytes_used = 1;
+            memset(&ps, 0, sizeof(ps));
+          } else if (src_bytes_used == __MB_ERR_INCOMPLETE_SEQUENCE) {
+            errno = EINVAL;
+            return false;
+          } else if (src_bytes_used == 0) {
+            // mbrtoc32() returns 0 for a decoded NUL, but the NUL still
+            // occupies one input byte; without this the input never advances.
+            src_bytes_used = 1;
+          }
+          break;
+        case UTF_16_BE:
+        case UTF_16_LE: {
+          if (*src_bytes_left < 2) {
+            errno = EINVAL;
+            return false;
+          }
+          bool swap = (src_encoding == UTF_16_BE);
+          wc = In16(*src_buf, swap);
+          // 0xd800-0xdbff: high surrogates
+          // 0xdc00-0xdfff: low surrogates
+          if (wc >= 0xd800 && wc <= 0xdfff) {
+            if (wc >= 0xdc00) {  // Low surrogate before high surrogate.
+              errno = EILSEQ;
+              return false;
+            }
+            if (*src_bytes_left < 4) {
+              errno = EINVAL;
+              return false;
+            }
+            uint16_t hi = wc;
+            uint16_t lo = In16(*src_buf + 2, swap);
+            if (lo < 0xdc00 || lo > 0xdfff) {  // High surrogate without its low half.
+              errno = EILSEQ;
+              return false;
+            }
+            wc = 0x10000 + ((hi - 0xd800) << 10) + (lo - 0xdc00);
+            src_bytes_used = 4;
+          }
+          break;
+        }
+        case UTF_32_BE:
+        case UTF_32_LE:
+        case WCHAR_T:
+          if (*src_bytes_left < 4) {
+            errno = EINVAL;
+            return false;
+          }
+          wc = In32(*src_buf, (src_encoding == UTF_32_BE));
+          break;
+      }
+      if (errno != EILSEQ) return true;
+      if (mode == ERROR) return false;
+      if (mode == TRANSLIT) {
+        wc = '?';
+        ++replacement_count;
+        return true;
+      }
+      if (mode != IGNORE) return true;
+      // IGNORE: drop the offending bytes and decode whatever follows. Stop
+      // once nothing is left instead of reading past the end of the input.
+      *src_buf += src_bytes_used;
+      *src_bytes_left -= src_bytes_used;
+      ignored = true;
+      if (*src_bytes_left == 0) {
+        have_char = false;
+        return true;
+      }
+    }
+  }
+
+  // Encodes `wc` into `buf` for the destination encoding and emits it.
+  // A character the destination cannot represent is handled according to
+  // `mode`. Returns false with errno set on failure.
+  bool Convert() {
+    errno = 0;
+    switch (dst_encoding) {
+      case US_ASCII:
+        buf[0] = wc;
+        dst_bytes_used = 1;
+        if (wc > 0x7f) errno = EILSEQ;
+        break;
+      case UTF_8:
+        dst_bytes_used = c32rtomb(buf, wc, &ps);
+        if (dst_bytes_used == __MB_ERR_ILLEGAL_SEQUENCE) {
+          break;  // EILSEQ already set.
+        } else if (dst_bytes_used == __MB_ERR_INCOMPLETE_SEQUENCE) {
+          errno = EINVAL;
+          return false;
+        }
+        break;
+      case UTF_16_BE:
+      case UTF_16_LE: {
+        bool swap = (dst_encoding == UTF_16_BE);
+        if (wc > 0x10ffff) {
+          // Beyond the last code point: no surrogate pair can express it.
+          errno = EILSEQ;
+        } else if (wc < 0x10000) {  // BMP.
+          Out16(buf, wc, swap);
+        } else {  // Supplementary plane; output surrogate pair.
+          char32_t offset = wc - 0x10000;
+          char16_t hi = 0xd800 | (offset >> 10);
+          char16_t lo = 0xdc00 | (offset & 0x3ff);
+          Out16(buf + 0, hi, swap);
+          Out16(buf + 2, lo, swap);
+          dst_bytes_used = 4;
+        }
+      } break;
+      case UTF_32_BE:
+      case UTF_32_LE:
+      case WCHAR_T:
+        Out32(wc, (dst_encoding == UTF_32_BE));
+        break;
+    }
+    if (errno == EILSEQ) {
+      if (mode == IGNORE) {
+        // Consume the input character without producing any output.
+        *src_buf += src_bytes_used;
+        *src_bytes_left -= src_bytes_used;
+        ignored = true;
+        return true;
+      } else if (mode == TRANSLIT) {
+        wc = '?';
+        ++replacement_count;
+        return Convert();
+      }
+      return false;
+    }
+    return Emit();
+  }
+
+  // Reads a 16-bit unit stored low byte first from `buf`, byte-swapping it
+  // when `swap` is set, and sets `src_bytes_used` to 2.
+  uint16_t In16(const char* buf, bool swap) {
+#ifdef __cplusplus
+    const uint8_t* bytes = reinterpret_cast<const uint8_t*>(buf);
+#else // !__cplusplus
+    const uint8_t* bytes = (const uint8_t*)(buf);
+#endif // __cplusplus
+    uint16_t unit = (bytes[0]) | (bytes[1] << 8);
+    if (swap) unit = __swap16(unit);
+    src_bytes_used = 2;
+    return unit;
+  }
+
+  // Reads a 32-bit unit stored low byte first from `buf`, byte-swapping it
+  // when `swap` is set, and sets `src_bytes_used` to 4.
+  uint32_t In32(const char* buf, bool swap) {
+#ifdef __cplusplus
+    const uint8_t* bytes = reinterpret_cast<const uint8_t*>(buf);
+#else // !__cplusplus
+    const uint8_t* bytes = (const uint8_t*)(buf);
+#endif // __cplusplus
+    // Widen before shifting so the top byte never lands in the sign bit of a
+    // promoted int.
+    uint32_t unit = (uint32_t)(bytes[0]) | ((uint32_t)(bytes[1]) << 8) |
+                    ((uint32_t)(bytes[2]) << 16) | ((uint32_t)(bytes[3]) << 24);
+    if (swap) unit = __swap32(unit);
+    src_bytes_used = 4;
+    return unit;
+  }
+
+  // Writes `ch` to dst[0..1] low byte first (after an optional byte swap) and
+  // sets `dst_bytes_used` to 2.
+  void Out16(char* dst, char16_t ch, bool swap) {
+    if (swap) ch = __swap16(ch);
+    dst[0] = ch;
+    dst[1] = ch >> 8;
+    dst_bytes_used = 2;
+  }
+
+  // Writes `ch` to buf[0..3] low byte first (after an optional byte swap) and
+  // sets `dst_bytes_used` to 4.
+  void Out32(char32_t ch, bool swap) {
+    if (swap) ch = __swap32(ch);
+    buf[0] = ch;
+    buf[1] = ch >> 8;
+    buf[2] = ch >> 16;
+    buf[3] = ch >> 24;
+    dst_bytes_used = 4;
+  }
+
+  // Copies the encoded character from `buf` to the output and advances both
+  // the input and the output. Fails with E2BIG, touching nothing, when the
+  // output has too little room.
+  bool Emit() {
+    if (dst_bytes_used > *dst_bytes_left) {
+      errno = E2BIG;
+      return false;
+    }
+    memcpy(*dst_buf, buf, dst_bytes_used);
+    *src_buf += src_bytes_used;
+    *src_bytes_left -= src_bytes_used;
+    *dst_buf += dst_bytes_used;
+    *dst_bytes_left -= dst_bytes_used;
+    return true;
+  }
+
+  // Computes the result of a conversion that reached the end of its input:
+  // the substitution count in TRANSLIT mode, -1 with EILSEQ when IGNORE mode
+  // skipped something, otherwise 0.
+  int Done() {
+    if (mode == TRANSLIT) return replacement_count;
+    if (ignored) {
+      errno = EILSEQ;
+      return -1;
+    }
+    return 0;
+  }
+};
+
+// Creates a conversion descriptor that converts from `__src_encoding` to
+// `__dst_encoding`. Only the destination name may carry a "//IGNORE" or
+// "//TRANSLIT" suffix. Returns INVALID_ICONV_T with errno set to EINVAL for a
+// missing or unsupported encoding name, or to ENOMEM when allocation fails.
+// Release the descriptor with iconv_close().
+iconv_t iconv_open(const char* __dst_encoding, const char* __src_encoding) {
+  if (__dst_encoding == NULL || __src_encoding == NULL) {
+    errno = EINVAL;
+    return INVALID_ICONV_T;
+  }
+  // `iconv_t()` value-initializes a pointer, i.e. yields null, so the object
+  // has to be allocated for real. calloc() pairs with the free() used below
+  // and in iconv_close().
+  iconv_t result = static_cast<iconv_t>(calloc(1, sizeof(__iconv_t)));
+  if (result == NULL) {
+    errno = ENOMEM;
+    return INVALID_ICONV_T;
+  }
+  result->mode = ERROR;
+  if (!__parse_encoding(__src_encoding, &result->src_encoding, 0 /* nullptr */) ||
+      !__parse_encoding(__dst_encoding, &result->dst_encoding, &result->mode)) {
+    free(result);
+    errno = EINVAL;  // Set after free() so free() cannot clobber it.
+    return INVALID_ICONV_T;
+  }
+  return result;
+}
+
+// Converts as much of the input as possible with `__converter`, advancing the
+// buffer pointers and decrementing the byte counts. Returns the converter's
+// result, or (size_t)-1 with errno set to EBADF for an invalid descriptor.
+size_t iconv(iconv_t __converter,
+             char** __src_buf, size_t* __src_bytes_left,
+             char** __dst_buf, size_t* __dst_bytes_left) {
+  if (__converter == INVALID_ICONV_T || __converter == NULL) {
+    errno = EBADF;
+    return static_cast<size_t>(-1);
+  }
+  // POSIX lets callers pass a null input buffer to reset the conversion state.
+  // The converter resets itself on every call, so there is nothing to do.
+  if (__src_buf == NULL || *__src_buf == NULL || __src_bytes_left == NULL) {
+    return 0;
+  }
+  return __converter->Convert(__src_buf, __src_bytes_left, __dst_buf, __dst_bytes_left);
+}
+
+// Releases a descriptor obtained from iconv_open(). Returns 0 on success, or
+// -1 with errno set to EBADF when given INVALID_ICONV_T.
+int iconv_close(iconv_t __converter) {
+  if (__converter == INVALID_ICONV_T) {
+    errno = EBADF;
+    return -1;
+  }
+  free(__converter);
+  return 0;
+}
+
+__END_DECLS
+
+#endif // __ANDROID__