charset.c 7.0 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398
  1. // SPDX-License-Identifier: GPL-2.0+
  2. /*
  3. * charset conversion utils
  4. *
  5. * Copyright (c) 2017 Rob Clark
  6. */
  7. #include <common.h>
  8. #include <charset.h>
  9. #include <capitalization.h>
  10. #include <malloc.h>
  11. static struct capitalization_table capitalization_table[] =
  12. #ifdef CONFIG_EFI_UNICODE_CAPITALIZATION
  13. UNICODE_CAPITALIZATION_TABLE;
  14. #elif CONFIG_FAT_DEFAULT_CODEPAGE == 1250
  15. CP1250_CAPITALIZATION_TABLE;
  16. #else
  17. CP437_CAPITALIZATION_TABLE;
  18. #endif
  19. /**
  20. * get_code() - read Unicode code point from UTF-8 stream
  21. *
  22. * @read_u8: - stream reader
  23. * @src: - string buffer passed to stream reader, optional
  24. * Return: - Unicode code point
  25. */
  26. static int get_code(u8 (*read_u8)(void *data), void *data)
  27. {
  28. s32 ch = 0;
  29. ch = read_u8(data);
  30. if (!ch)
  31. return 0;
  32. if (ch >= 0xc2 && ch <= 0xf4) {
  33. int code = 0;
  34. if (ch >= 0xe0) {
  35. if (ch >= 0xf0) {
  36. /* 0xf0 - 0xf4 */
  37. ch &= 0x07;
  38. code = ch << 18;
  39. ch = read_u8(data);
  40. if (ch < 0x80 || ch > 0xbf)
  41. goto error;
  42. ch &= 0x3f;
  43. } else {
  44. /* 0xe0 - 0xef */
  45. ch &= 0x0f;
  46. }
  47. code += ch << 12;
  48. if ((code >= 0xD800 && code <= 0xDFFF) ||
  49. code >= 0x110000)
  50. goto error;
  51. ch = read_u8(data);
  52. if (ch < 0x80 || ch > 0xbf)
  53. goto error;
  54. }
  55. /* 0xc0 - 0xdf or continuation byte (0x80 - 0xbf) */
  56. ch &= 0x3f;
  57. code += ch << 6;
  58. ch = read_u8(data);
  59. if (ch < 0x80 || ch > 0xbf)
  60. goto error;
  61. ch &= 0x3f;
  62. ch += code;
  63. } else if (ch >= 0x80) {
  64. goto error;
  65. }
  66. return ch;
  67. error:
  68. return '?';
  69. }
  70. /**
  71. * read_string() - read byte from character string
  72. *
  73. * @data: - pointer to string
  74. * Return: - byte read
  75. *
  76. * The string pointer is incremented if it does not point to '\0'.
  77. */
  78. static u8 read_string(void *data)
  79. {
  80. const char **src = (const char **)data;
  81. u8 c;
  82. if (!src || !*src || !**src)
  83. return 0;
  84. c = **src;
  85. ++*src;
  86. return c;
  87. }
  88. /**
  89. * read_console() - read byte from console
  90. *
  91. * @src - not used, needed to match interface
  92. * Return: - byte read
  93. */
  94. static u8 read_console(void *data)
  95. {
  96. return getc();
  97. }
  98. int console_read_unicode(s32 *code)
  99. {
  100. if (!tstc()) {
  101. /* No input available */
  102. return 1;
  103. }
  104. /* Read Unicode code */
  105. *code = get_code(read_console, NULL);
  106. return 0;
  107. }
  108. s32 utf8_get(const char **src)
  109. {
  110. return get_code(read_string, src);
  111. }
  112. int utf8_put(s32 code, char **dst)
  113. {
  114. if (!dst || !*dst)
  115. return -1;
  116. if ((code >= 0xD800 && code <= 0xDFFF) || code >= 0x110000)
  117. return -1;
  118. if (code <= 0x007F) {
  119. **dst = code;
  120. } else {
  121. if (code <= 0x07FF) {
  122. **dst = code >> 6 | 0xC0;
  123. } else {
  124. if (code < 0x10000) {
  125. **dst = code >> 12 | 0xE0;
  126. } else {
  127. **dst = code >> 18 | 0xF0;
  128. ++*dst;
  129. **dst = (code >> 12 & 0x3F) | 0x80;
  130. }
  131. ++*dst;
  132. **dst = (code >> 6 & 0x3F) | 0x80;
  133. }
  134. ++*dst;
  135. **dst = (code & 0x3F) | 0x80;
  136. }
  137. ++*dst;
  138. return 0;
  139. }
  140. size_t utf8_utf16_strnlen(const char *src, size_t count)
  141. {
  142. size_t len = 0;
  143. for (; *src && count; --count) {
  144. s32 code = utf8_get(&src);
  145. if (!code)
  146. break;
  147. if (code < 0) {
  148. /* Reserve space for a replacement character */
  149. len += 1;
  150. } else if (code < 0x10000) {
  151. len += 1;
  152. } else {
  153. len += 2;
  154. }
  155. }
  156. return len;
  157. }
  158. int utf8_utf16_strncpy(u16 **dst, const char *src, size_t count)
  159. {
  160. if (!src || !dst || !*dst)
  161. return -1;
  162. for (; count && *src; --count) {
  163. s32 code = utf8_get(&src);
  164. if (code < 0)
  165. code = '?';
  166. utf16_put(code, dst);
  167. }
  168. **dst = 0;
  169. return 0;
  170. }
  171. s32 utf16_get(const u16 **src)
  172. {
  173. s32 code, code2;
  174. if (!src || !*src)
  175. return -1;
  176. if (!**src)
  177. return 0;
  178. code = **src;
  179. ++*src;
  180. if (code >= 0xDC00 && code <= 0xDFFF)
  181. return -1;
  182. if (code >= 0xD800 && code <= 0xDBFF) {
  183. if (!**src)
  184. return -1;
  185. code &= 0x3ff;
  186. code <<= 10;
  187. code += 0x10000;
  188. code2 = **src;
  189. ++*src;
  190. if (code2 <= 0xDC00 || code2 >= 0xDFFF)
  191. return -1;
  192. code2 &= 0x3ff;
  193. code += code2;
  194. }
  195. return code;
  196. }
  197. int utf16_put(s32 code, u16 **dst)
  198. {
  199. if (!dst || !*dst)
  200. return -1;
  201. if ((code >= 0xD800 && code <= 0xDFFF) || code >= 0x110000)
  202. return -1;
  203. if (code < 0x10000) {
  204. **dst = code;
  205. } else {
  206. code -= 0x10000;
  207. **dst = code >> 10 | 0xD800;
  208. ++*dst;
  209. **dst = (code & 0x3ff) | 0xDC00;
  210. }
  211. ++*dst;
  212. return 0;
  213. }
  214. size_t utf16_strnlen(const u16 *src, size_t count)
  215. {
  216. size_t len = 0;
  217. for (; *src && count; --count) {
  218. s32 code = utf16_get(&src);
  219. if (!code)
  220. break;
  221. /*
  222. * In case of an illegal sequence still reserve space for a
  223. * replacement character.
  224. */
  225. ++len;
  226. }
  227. return len;
  228. }
  229. size_t utf16_utf8_strnlen(const u16 *src, size_t count)
  230. {
  231. size_t len = 0;
  232. for (; *src && count; --count) {
  233. s32 code = utf16_get(&src);
  234. if (!code)
  235. break;
  236. if (code < 0)
  237. /* Reserve space for a replacement character */
  238. len += 1;
  239. else if (code < 0x80)
  240. len += 1;
  241. else if (code < 0x800)
  242. len += 2;
  243. else if (code < 0x10000)
  244. len += 3;
  245. else
  246. len += 4;
  247. }
  248. return len;
  249. }
  250. int utf16_utf8_strncpy(char **dst, const u16 *src, size_t count)
  251. {
  252. if (!src || !dst || !*dst)
  253. return -1;
  254. for (; count && *src; --count) {
  255. s32 code = utf16_get(&src);
  256. if (code < 0)
  257. code = '?';
  258. utf8_put(code, dst);
  259. }
  260. **dst = 0;
  261. return 0;
  262. }
  263. s32 utf_to_lower(const s32 code)
  264. {
  265. struct capitalization_table *pos = capitalization_table;
  266. s32 ret = code;
  267. if (code <= 0x7f) {
  268. if (code >= 'A' && code <= 'Z')
  269. ret += 0x20;
  270. return ret;
  271. }
  272. for (; pos->upper; ++pos) {
  273. if (pos->upper == code) {
  274. ret = pos->lower;
  275. break;
  276. }
  277. }
  278. return ret;
  279. }
  280. s32 utf_to_upper(const s32 code)
  281. {
  282. struct capitalization_table *pos = capitalization_table;
  283. s32 ret = code;
  284. if (code <= 0x7f) {
  285. if (code >= 'a' && code <= 'z')
  286. ret -= 0x20;
  287. return ret;
  288. }
  289. for (; pos->lower; ++pos) {
  290. if (pos->lower == code) {
  291. ret = pos->upper;
  292. break;
  293. }
  294. }
  295. return ret;
  296. }
  297. size_t u16_strlen(const u16 *in)
  298. {
  299. size_t i;
  300. for (i = 0; in[i]; i++);
  301. return i;
  302. }
  303. size_t u16_strnlen(const u16 *in, size_t count)
  304. {
  305. size_t i;
  306. for (i = 0; count-- && in[i]; i++);
  307. return i;
  308. }
  309. /* Convert UTF-16 to UTF-8. */
  310. uint8_t *utf16_to_utf8(uint8_t *dest, const uint16_t *src, size_t size)
  311. {
  312. uint32_t code_high = 0;
  313. while (size--) {
  314. uint32_t code = *src++;
  315. if (code_high) {
  316. if (code >= 0xDC00 && code <= 0xDFFF) {
  317. /* Surrogate pair. */
  318. code = ((code_high - 0xD800) << 10) + (code - 0xDC00) + 0x10000;
  319. *dest++ = (code >> 18) | 0xF0;
  320. *dest++ = ((code >> 12) & 0x3F) | 0x80;
  321. *dest++ = ((code >> 6) & 0x3F) | 0x80;
  322. *dest++ = (code & 0x3F) | 0x80;
  323. } else {
  324. /* Error... */
  325. *dest++ = '?';
  326. /* *src may be valid. Don't eat it. */
  327. src--;
  328. }
  329. code_high = 0;
  330. } else {
  331. if (code <= 0x007F) {
  332. *dest++ = code;
  333. } else if (code <= 0x07FF) {
  334. *dest++ = (code >> 6) | 0xC0;
  335. *dest++ = (code & 0x3F) | 0x80;
  336. } else if (code >= 0xD800 && code <= 0xDBFF) {
  337. code_high = code;
  338. continue;
  339. } else if (code >= 0xDC00 && code <= 0xDFFF) {
  340. /* Error... */
  341. *dest++ = '?';
  342. } else if (code < 0x10000) {
  343. *dest++ = (code >> 12) | 0xE0;
  344. *dest++ = ((code >> 6) & 0x3F) | 0x80;
  345. *dest++ = (code & 0x3F) | 0x80;
  346. } else {
  347. *dest++ = (code >> 18) | 0xF0;
  348. *dest++ = ((code >> 12) & 0x3F) | 0x80;
  349. *dest++ = ((code >> 6) & 0x3F) | 0x80;
  350. *dest++ = (code & 0x3F) | 0x80;
  351. }
  352. }
  353. }
  354. return dest;
  355. }