charset.c 5.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310
  1. // SPDX-License-Identifier: GPL-2.0+
  2. /*
  3. * charset conversion utils
  4. *
  5. * Copyright (c) 2017 Rob Clark
  6. */
  7. #include <charset.h>
  8. #include <malloc.h>
  /**
   * utf8_get() - read the next Unicode code point from a UTF-8 string
   *
   * Decodes one UTF-8 sequence starting at *src and moves *src behind the
   * bytes that were consumed. The terminating zero byte is never consumed.
   * If a byte that should be a continuation byte (0x80 - 0xbf) is not one,
   * decoding stops in front of that byte so that the next call can decode it.
   *
   * Rejected as illegal: stray continuation bytes, lead bytes 0xc0, 0xc1 and
   * 0xf5 - 0xff, truncated sequences, overlong encodings, surrogates
   * (0xd800 - 0xdfff) and values above 0x10ffff.
   *
   * @src:	pointer to the current read position, updated on return
   * Return:	the code point, 0 at the end of the string (*src unchanged),
   *		-1 if src or *src is NULL or the sequence is illegal
   */
  s32 utf8_get(const char **src)
  {
      s32 code;
      s32 lowest;
      unsigned char c;
      int trail;

      if (!src || !*src)
          return -1;
      c = **src;
      if (!c)
          return 0;
      ++*src;
      if (c < 0x80)
          return c;
      /*
       * 0x80 - 0xbf are continuation bytes and cannot start a sequence.
       * 0x80 is coded as 0xc2 0x80, so a lead byte below 0xc2 is overlong.
       * The highest code point 0x10ffff is coded as 0xf4 0x8f 0xbf 0xbf,
       * so a lead byte above 0xf4 is out of range.
       */
      if (c < 0xc2 || c > 0xf4)
          return -1;
      if (c >= 0xf0) {
          /* 0xf0 - 0xf4: four byte sequence */
          trail = 3;
          code = c & 0x07;
          lowest = 0x10000;
      } else if (c >= 0xe0) {
          /* 0xe0 - 0xef: three byte sequence */
          trail = 2;
          code = c & 0x0f;
          lowest = 0x800;
      } else {
          /* 0xc2 - 0xdf: two byte sequence */
          trail = 1;
          code = c & 0x1f;
          lowest = 0x80;
      }
      while (trail--) {
          c = **src;
          /* This also catches the terminating zero byte. */
          if (c < 0x80 || c > 0xbf)
              return -1;
          ++*src;
          code = (code << 6) | (c & 0x3f);
      }
      /* Validate only once all payload bits have been merged. */
      if (code < lowest || code > 0x10ffff ||
          (code >= 0xd800 && code <= 0xdfff))
          return -1;
      return code;
  }
  /**
   * utf8_put() - write a Unicode code point as UTF-8
   *
   * Encodes @code into one to four bytes at *dst and moves *dst behind the
   * bytes written. No terminating zero byte is appended. Nothing is written
   * and *dst is left unchanged if an error is returned.
   *
   * @code:	code point to encode
   * @dst:	pointer to the current write position, updated on success
   * Return:	0 on success, -1 if dst or *dst is NULL or if @code is negative,
   *		a surrogate (0xD800 - 0xDFFF) or above 0x10FFFF
   */
  int utf8_put(s32 code, char **dst)
  {
      char *out;

      if (!dst || !*dst)
          return -1;
      /* A negative value would otherwise be stored as a truncated byte. */
      if (code < 0 || code >= 0x110000 ||
          (code >= 0xD800 && code <= 0xDFFF))
          return -1;

      out = *dst;
      if (code <= 0x007F) {
          *out++ = code;
      } else if (code <= 0x07FF) {
          *out++ = code >> 6 | 0xC0;
          *out++ = (code & 0x3F) | 0x80;
      } else if (code < 0x10000) {
          *out++ = code >> 12 | 0xE0;
          *out++ = (code >> 6 & 0x3F) | 0x80;
          *out++ = (code & 0x3F) | 0x80;
      } else {
          *out++ = code >> 18 | 0xF0;
          *out++ = (code >> 12 & 0x3F) | 0x80;
          *out++ = (code >> 6 & 0x3F) | 0x80;
          *out++ = (code & 0x3F) | 0x80;
      }
      *dst = out;
      return 0;
  }
  /**
   * utf8_utf16_strnlen() - number of u16 units needed for a UTF-8 string
   *
   * Walks @src with utf8_get() and adds up how many 16-bit units each decoded
   * code point occupies in UTF-16. Counting stops at the end of the string or
   * after @count calls to utf8_get(), whichever comes first. The terminator
   * is not counted.
   *
   * @src:	UTF-8 string
   * @count:	maximum number of code points to evaluate
   * Return:	number of u16 units
   */
  size_t utf8_utf16_strnlen(const char *src, size_t count)
  {
      size_t units = 0;

      while (*src && count) {
          s32 cp = utf8_get(&src);

          if (cp == 0)
              break;
          /*
           * Only code points from 0x10000 upwards need a surrogate pair.
           * An illegal sequence (negative value) gets one unit reserved
           * for a replacement character.
           */
          units += (cp >= 0x10000) ? 2 : 1;
          --count;
      }
      return units;
  }
  /**
   * utf8_utf16_strncpy() - copy a UTF-8 string to UTF-16
   *
   * Decodes @src with utf8_get() and appends each code point at *dst with
   * utf16_put(), replacing illegal sequences by '?'. Stops at the end of the
   * string or after @count decoded code points. A zero u16 is stored at the
   * final position of *dst; *dst is left pointing at that terminator.
   *
   * @dst:	pointer to the current write position, updated while copying
   * @src:	UTF-8 string
   * @count:	maximum number of code points to copy
   * Return:	0 on success, -1 if @src, @dst or *dst is NULL
   */
  int utf8_utf16_strncpy(u16 **dst, const char *src, size_t count)
  {
      size_t left = count;

      if (!src || !dst || !*dst)
          return -1;

      while (left && *src) {
          s32 cp = utf8_get(&src);

          /* Substitute a question mark for anything undecodable. */
          utf16_put(cp < 0 ? '?' : cp, dst);
          --left;
      }
      **dst = 0;
      return 0;
  }
  /**
   * utf16_get() - read the next Unicode code point from a UTF-16 string
   *
   * Decodes one code point starting at *src and moves *src behind the units
   * that were consumed. The terminating zero unit is never consumed. If a
   * high surrogate is not followed by a low surrogate, only the high
   * surrogate is consumed so that the next call can decode the following
   * unit.
   *
   * @src:	pointer to the current read position, updated on return
   * Return:	the code point, 0 at the end of the string (*src unchanged),
   *		-1 if src or *src is NULL or on an unpaired surrogate
   */
  s32 utf16_get(const u16 **src)
  {
      s32 code, code2;

      if (!src || !*src)
          return -1;
      code = **src;
      if (!code)
          return 0;
      ++*src;
      /* Anything outside the surrogate range is a complete code point. */
      if (code < 0xD800 || code > 0xDFFF)
          return code;
      /* A low surrogate must not come first. */
      if (code >= 0xDC00)
          return -1;
      /*
       * The low surrogate range is 0xDC00 - 0xDFFF, both ends included.
       * This test also catches the terminating zero unit.
       */
      code2 = **src;
      if (code2 < 0xDC00 || code2 > 0xDFFF)
          return -1;
      ++*src;
      return 0x10000 + ((code & 0x3ff) << 10) + (code2 & 0x3ff);
  }
  /**
   * utf16_put() - write a Unicode code point as UTF-16
   *
   * Encodes @code into one u16 unit, or into a surrogate pair for code points
   * from 0x10000 upwards, and moves *dst behind the units written. No
   * terminating zero unit is appended. Nothing is written and *dst is left
   * unchanged if an error is returned.
   *
   * @code:	code point to encode
   * @dst:	pointer to the current write position, updated on success
   * Return:	0 on success, -1 if dst or *dst is NULL or if @code is negative,
   *		a surrogate (0xD800 - 0xDFFF) or above 0x10FFFF
   */
  int utf16_put(s32 code, u16 **dst)
  {
      u16 *out;

      if (!dst || !*dst)
          return -1;
      /* A negative value would otherwise be stored as a truncated unit. */
      if (code < 0 || code >= 0x110000 ||
          (code >= 0xD800 && code <= 0xDFFF))
          return -1;

      out = *dst;
      if (code < 0x10000) {
          *out++ = code;
      } else {
          /* Split the 20 bit offset into high and low surrogate. */
          code -= 0x10000;
          *out++ = code >> 10 | 0xD800;
          *out++ = (code & 0x3ff) | 0xDC00;
      }
      *dst = out;
      return 0;
  }
  /**
   * utf16_strnlen() - count the code points in a UTF-16 string
   *
   * Walks @src with utf16_get() and counts one per call, whether the call
   * returned a code point or reported an illegal sequence. Counting stops at
   * the end of the string or after @count calls, whichever comes first.
   *
   * @src:	UTF-16 string
   * @count:	maximum number of code points to evaluate
   * Return:	number of code points
   */
  size_t utf16_strnlen(const u16 *src, size_t count)
  {
      size_t total = 0;

      while (*src && count) {
          if (utf16_get(&src) == 0)
              break;
          /* Illegal sequences count too: room for a replacement. */
          total++;
          count--;
      }
      return total;
  }
  /**
   * utf16_utf8_strnlen() - number of bytes needed for a UTF-16 string in UTF-8
   *
   * Walks @src with utf16_get() and adds up how many bytes each decoded code
   * point occupies in UTF-8. Counting stops at the end of the string or after
   * @count calls to utf16_get(), whichever comes first. The terminator is not
   * counted.
   *
   * @src:	UTF-16 string
   * @count:	maximum number of code points to evaluate
   * Return:	number of bytes
   */
  size_t utf16_utf8_strnlen(const u16 *src, size_t count)
  {
      size_t bytes = 0;

      while (*src && count) {
          s32 cp = utf16_get(&src);

          if (cp == 0)
              break;
          if (cp >= 0x10000) {
              bytes += 4;
          } else if (cp >= 0x800) {
              bytes += 3;
          } else if (cp >= 0x80) {
              bytes += 2;
          } else {
              /*
               * Single byte code point, or an illegal sequence for
               * which one byte is reserved for a replacement character.
               */
              bytes += 1;
          }
          --count;
      }
      return bytes;
  }
  /**
   * utf16_utf8_strncpy() - copy a UTF-16 string to UTF-8
   *
   * Decodes @src with utf16_get() and appends each code point at *dst with
   * utf8_put(), replacing illegal sequences by '?'. Stops at the end of the
   * string or after @count decoded code points. A zero byte is stored at the
   * final position of *dst; *dst is left pointing at that terminator.
   *
   * @dst:	pointer to the current write position, updated while copying
   * @src:	UTF-16 string
   * @count:	maximum number of code points to copy
   * Return:	0 on success, -1 if @src, @dst or *dst is NULL
   */
  int utf16_utf8_strncpy(char **dst, const u16 *src, size_t count)
  {
      size_t left = count;

      if (!src || !dst || !*dst)
          return -1;

      while (left && *src) {
          s32 cp = utf16_get(&src);

          /* Substitute a question mark for anything undecodable. */
          utf8_put(cp < 0 ? '?' : cp, dst);
          --left;
      }
      **dst = 0;
      return 0;
  }
  /**
   * u16_strlen() - count the u16 units in a zero terminated string
   *
   * Surrogate pairs are not interpreted; every non-zero unit counts as one.
   *
   * @in:	zero terminated string of u16 units
   * Return:	number of units in front of the terminating zero
   */
  size_t u16_strlen(const u16 *in)
  {
      const u16 *end = in;

      while (*end)
          ++end;
      return end - in;
  }
  /**
   * u16_strnlen() - count the u16 units in a string, up to a limit
   *
   * Surrogate pairs are not interpreted; every non-zero unit counts as one.
   * At most @count units of @in are read.
   *
   * @in:	string of u16 units
   * @count:	maximum number of units to count
   * Return:	number of units in front of the terminating zero, or @count if
   *		no zero was found within the first @count units
   */
  size_t u16_strnlen(const u16 *in, size_t count)
  {
      size_t n = 0;

      while (n < count && in[n])
          ++n;
      return n;
  }
  /**
   * utf16_to_utf8() - convert a buffer of UTF-16 units to UTF-8
   *
   * Converts exactly @size units of @src. A zero unit does not end the
   * conversion; it is copied as a zero byte. No terminator is appended.
   * Every unpaired surrogate, including a high surrogate in the last unit,
   * is replaced by '?'. The unit following an unpaired high surrogate is
   * converted on its own. At most three bytes are written per input unit.
   *
   * @dest:	output buffer
   * @src:	UTF-16 input
   * @size:	number of 16-bit units to convert
   * Return:	pointer to the byte following the last byte written
   */
  uint8_t *utf16_to_utf8(uint8_t *dest, const uint16_t *src, size_t size)
  {
      size_t pos = 0;

      while (pos < size) {
          uint32_t code = src[pos++];

          if (code >= 0xD800 && code <= 0xDBFF) {
              /* High surrogate: peek at the next unit, if any. */
              uint32_t low = pos < size ? src[pos] : 0;

              if (low < 0xDC00 || low > 0xDFFF) {
                  /* Error: leave the next unit for the next round. */
                  *dest++ = '?';
                  continue;
              }
              ++pos;
              /* Surrogate pair. */
              code = ((code - 0xD800) << 10) + (low - 0xDC00) + 0x10000;
              *dest++ = (code >> 18) | 0xF0;
              *dest++ = ((code >> 12) & 0x3F) | 0x80;
              *dest++ = ((code >> 6) & 0x3F) | 0x80;
              *dest++ = (code & 0x3F) | 0x80;
          } else if (code >= 0xDC00 && code <= 0xDFFF) {
              /* Error: low surrogate without a preceding high one. */
              *dest++ = '?';
          } else if (code <= 0x007F) {
              *dest++ = code;
          } else if (code <= 0x07FF) {
              *dest++ = (code >> 6) | 0xC0;
              *dest++ = (code & 0x3F) | 0x80;
          } else {
              /* Remaining values of a 16-bit unit need three bytes. */
              *dest++ = (code >> 12) | 0xE0;
              *dest++ = ((code >> 6) & 0x3F) | 0x80;
              *dest++ = (code & 0x3F) | 0x80;
          }
      }
      return dest;
  }