https://github.com/unicode-org/icu/commit/b8389b1186fe4b56ca91eadfa31886a3b4a195c6 --- icu/source/tools/icuexportdata/icuexportdata.cpp.orig +++ icu/source/tools/icuexportdata/icuexportdata.cpp @@ -755,9 +755,12 @@ std::vector nonRecursive32; LocalUMutableCPTriePointer nonRecursiveBuilder(umutablecptrie_open(0, 0, status)); + UBool uts46 = false; + if (uprv_strcmp(basename, "nfkd") == 0) { mainNormalizer = Normalizer2::getNFKDInstance(status); } else if (uprv_strcmp(basename, "uts46d") == 0) { + uts46 = true; mainNormalizer = Normalizer2::getInstance(nullptr, "uts46", UNORM2_COMPOSE, status); } else { mainNormalizer = nfdNormalizer; @@ -817,23 +820,38 @@ nfcNormalizer->normalize(dst, nfc, status); nonNfdOrRoundTrips = (src == nfc); } + if (uts46) { + // Work around https://unicode-org.atlassian.net/browse/ICU-22658 + // TODO: Remove the workaround after data corresponding to + // https://www.unicode.org/L2/L2024/24061.htm#179-C36 lands + // for Unicode 16. + switch (c) { + case 0x2F868: + dst.truncate(0); + dst.append(UChar32(0x36FC)); + break; + case 0x2F874: + dst.truncate(0); + dst.append(UChar32(0x5F53)); + break; + case 0x2F91F: + dst.truncate(0); + dst.append(UChar32(0x243AB)); + break; + case 0x2F95F: + dst.truncate(0); + dst.append(UChar32(0x7AEE)); + break; + case 0x2F9BF: + dst.truncate(0); + dst.append(UChar32(0x45D7)); + break; + } + } int32_t len = dst.toUTF32(utf32, DECOMPOSITION_BUFFER_SIZE, status); + if (!len || (len == 1 && utf32[0] == 0xFFFD && c != 0xFFFD)) { - // Characters that normalize to nothing or to U+FFFD (without the - // input being U+FFFD) in ICU4C's UTS 46 normalization normalize - // as in NFD in ICU4X's UTF 46 normalization in the interest - // of data size and ICU4X's normalizer being unable to handle - // normalizing to nothing. - // When UTS 46 is implemented on top of ICU4X, a preprocessing - // step is supposed to remove these characters before the - // normalization step. - if (uprv_strcmp(basename, "uts46d") != 0) { - status.set(U_INTERNAL_PROGRAM_ERROR); - handleError(status, basename); - } - nfdNormalizer->normalize(src, dst, status); - len = dst.toUTF32(utf32, DECOMPOSITION_BUFFER_SIZE, status); - if (!len || (len == 1 && utf32[0] == 0xFFFD && c != 0xFFFD)) { + if (!uts46) { status.set(U_INTERNAL_PROGRAM_ERROR); handleError(status, basename); } @@ -951,7 +969,13 @@ if (!nonNfdOrRoundTrips) { compositionPassthroughBound = c; } - if (len == 1 && utf32[0] <= 0xFFFF) { + if (!len) { + if (!uts46) { + status.set(U_INTERNAL_PROGRAM_ERROR); + handleError(status, basename); + } + pendingTrieInsertions.push_back({c, 0xFFFFFFFF, false}); + } else if (len == 1 && utf32[0] <= 0xFFFF) { if (startsWithBackwardCombiningStarter) { if (mainNormalizer == nfdNormalizer) { // Not supposed to happen in NFD