/***************************************************************************** * dvb-text.h: ***************************************************************************** * Copyright (C) 2007-2011 VLC authors and VideoLAN * * This program is free software; you can redistribute it and/or modify it * under the terms of the GNU Lesser General Public License as published by * the Free Software Foundation; either version 2.1 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public License * along with this program; if not, write to the Free Software Foundation, * Inc., 51 Franklin Street, Fifth Floor, Boston MA 02110-1301, USA. *****************************************************************************/ /** * Converts a DVB SI text item to UTF-8. * Refer to EN 800 486 annex A. * @return a heap-allocation nul-terminated UTF-8 string or NULL on error. */ static char *vlc_from_EIT (const void *buf, size_t length) { if (unlikely(length == 0)) return NULL; char encbuf[12]; const char *encoding = encbuf; const char *in = buf; size_t offset = 1; unsigned char c = *in; if (c >= 0x20) { offset = 0; encoding = "ISO_6937"; } else if ((1 << c) & 0x0EFE) /* 1-7, 9-11 -> ISO 8859-(c+4) */ { snprintf (encbuf, sizeof (encbuf), "ISO_8859-%u", 4u + c); } else switch (c) { case 0x10: /* two more bytes */ offset = 3; if (length < 3 || in[1] != 0x00) return NULL; c = in[2]; if ((1 << c) & 0xEFFE) /* 1-11, 13-15 -> ISO 8859-(c) */ snprintf (encbuf, sizeof (encbuf), "ISO_8859-%hhu", c); else return NULL; break; case 0x11: /* the BMP */ case 0x14: /* Big5 subset of the BMP */ encoding = "UCS-2BE"; break; case 0x12: /* DVB has no clue about Korean. KS X 1001 (a.k.a. KS C 5601) is a * character set, not a character encoding... So we assume EUC-KR. * It is an encoding of KS X 1001. In practice, I guess nobody uses * this in any real DVB system. */ encoding = "EUC-KR"; break; case 0x13: /* GB-2312-1980 */ encoding = "GB2312"; break; case 0x15: encoding = "UTF-8"; break; #if 0 case 0x1F: /* operator-specific(?) */ offset = 2; #endif default: return NULL; } in += offset; length -= offset; char *out = FromCharset (encoding, in, length); if (out == NULL) { /* Fallback... */ out = strndup (in, length); if (unlikely(out == NULL)) return NULL; EnsureUTF8 (out); } length = strlen(out); /* Convert control codes */ for (char *p = strchr (out, '\xC2'); p; p = strchr (p + 1, '\xC2')) { /* We have valid UTF-8, to 0xC2 is followed by a continuation byte. */ /* 0x80-0x85,0x88-0x89 are reserved. * 0x86-0x87 are identical to Unicode and Latin-1. * 0x8A is CR/LF. * 0x8B-0x9F are unspecified. */ if (p[1] == '\x8A') memcpy (p, "\r\n", 2); /* Strip character emphasis */ if (p[1] == '\x86' || p[1] == '\x87') { const size_t n = p - out; memmove (p, p+2, length - n); length -= 2; out[length] = '\0'; if (length == n) break; } } /* Private use area */ for (char *p = strchr (out, '\xEE'); p; p = strchr (p + 1, '\xEE')) { /* Within UTF-8, 0xEE is followed by a two continuation bytes. */ if (p[1] != '\x82') continue; if (p[2] == '\x8A') memcpy (p, "\r\r\n", 3); /* we need three bytes, so to CRs ;) */ /* Strip character emphasis */ if (p[2] == '\x86' || p[2] == '\x87') { const size_t n = p - out; memmove (p, p+3, length - n); length -= 3; out[length] = '\0'; if (length == n) break; } } return out; }