1 : /* -*- mode: C -*-
2 : *
3 : * File: pdf-text-encoding.c
4 : * Date: Fri Jan 11 21:09:56 2008
5 : *
6 : * GNU PDF Library - Encoded Text handling utilities - Encoding
7 : *
8 : */
9 :
10 : /* Copyright (C) 2008 Free Software Foundation, Inc. */
11 :
12 : /* This program is free software: you can redistribute it and/or modify
13 : * it under the terms of the GNU General Public License as published by
14 : * the Free Software Foundation, either version 3 of the License, or
15 : * (at your option) any later version.
16 : *
17 : * This program is distributed in the hope that it will be useful,
18 : * but WITHOUT ANY WARRANTY; without even the implied warranty of
19 : * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20 : * GNU General Public License for more details.
21 : *
22 : * You should have received a copy of the GNU General Public License
23 : * along with this program. If not, see <http://www.gnu.org/licenses/>.
24 : */
25 :
26 : #include <config.h>
27 :
28 : #include <string.h>
29 : #include <stdio.h>
30 :
31 : #include <pdf-text-encoding.h>
32 : #include <pdf-text-context.h>
33 :
34 :
/* Byte-swapping helpers.
 *
 * Both the macro argument and the whole expansion are parenthesized, so the
 * macros behave like ordinary values inside larger expressions (for example
 * `2 * SWAP (x)' or `SWAP (a | b)').  The argument is evaluated several
 * times and therefore must not have side effects. */

/* Exchange the two low-order bytes of NUMBER */
#define PDF_TEXT_CHANGE_ENDIANNESS_16BIT(number)  \
  ((((number) & 0x00FF) << 8) |                   \
   (((number) & 0xFF00) >> 8))

/* Reverse the order of the four low-order bytes of NUMBER.  The masks are
 * unsigned so that moving a byte into bits 24-31 never shifts into the sign
 * bit of a signed int. */
#define PDF_TEXT_CHANGE_ENDIANNESS_32BIT(number)  \
  ((((number) & 0x000000FFu) << 24) |             \
   (((number) & 0x0000FF00u) <<  8) |             \
   (((number) & 0x00FF0000u) >>  8) |             \
   (((number) & 0xFF000000u) >> 24))
43 :
44 :
45 : /* Mapping between PDF Doc Encoding and UNICODE UTF32 (Host Endian!)
46 : * Obtained from PDF Reference v1.7, appendix D.2 */
47 : #define PDFDOCENC_MAX 256
48 : static const pdf_u32_t pdfdocenc_map [PDFDOCENC_MAX] = { /* INDEXES */
49 : 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, /* 00, 07 */
50 : 0x0000, 0x0009, 0x000A, 0x0000, 0x0000, 0x000D, 0x0000, 0x0000, /* 08, 0F */
51 : 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, /* 10, 17 */
52 : 0x02D8, 0x02C7, 0x02C6, 0x02D9, 0x02DD, 0x02DB, 0x02DA, 0x02DC, /* 18, 1F */
53 : 0x0020, 0x0021, 0x0022, 0x0023, 0x0024, 0x0025, 0x0026, 0x0027, /* 20, 27 */
54 : 0x0028, 0x0029, 0x002A, 0x002B, 0x002C, 0x002D, 0x002E, 0x002F, /* 28, 2F */
55 : 0x0030, 0x0031, 0x0032, 0x0033, 0x0034, 0x0035, 0x0036, 0x0037, /* 30, 37 */
56 : 0x0038, 0x0039, 0x003A, 0x003B, 0x003C, 0x003D, 0x003E, 0x003F, /* 38, 3F */
57 : 0x0040, 0x0041, 0x0042, 0x0043, 0x0044, 0x0045, 0x0046, 0x0047, /* 40, 47 */
58 : 0x0048, 0x0049, 0x004A, 0x004B, 0x004C, 0x004D, 0x004E, 0x004F, /* 48, 4F */
59 : 0x0050, 0x0051, 0x0052, 0x0053, 0x0054, 0x0055, 0x0056, 0x0057, /* 50, 57 */
60 : 0x0058, 0x0059, 0x005A, 0x005B, 0x005C, 0x005D, 0x005E, 0x005F, /* 58, 5F */
61 : 0x0060, 0x0061, 0x0062, 0x0063, 0x0064, 0x0065, 0x0066, 0x0067, /* 60, 67 */
62 : 0x0068, 0x0069, 0x006A, 0x006B, 0x006C, 0x006D, 0x006E, 0x006F, /* 68, 6F */
63 : 0x0070, 0x0071, 0x0072, 0x0073, 0x0074, 0x0075, 0x0076, 0x0077, /* 70, 77 */
64 : 0x0078, 0x0079, 0x007A, 0x007B, 0x007C, 0x007D, 0x007E, 0x0000, /* 78, 7F */
65 : 0x2022, 0x2020, 0x2021, 0x2026, 0x2014, 0x2013, 0x0192, 0x2044, /* 80, 87 */
66 : 0x2039, 0x203A, 0x2212, 0x2030, 0x201E, 0x201C, 0x201D, 0x2018, /* 88, 8F */
67 : 0x2019, 0x201A, 0x2122, 0xFB01, 0xFB02, 0x0141, 0x0152, 0x0160, /* 90, 97 */
68 : 0x0178, 0x017D, 0x0131, 0x0142, 0x0153, 0x0161, 0x017E, 0x0000, /* 98, 9F */
69 : 0x20AC, 0x00A1, 0x00A2, 0x00A3, 0x00A4, 0x00A5, 0x00A6, 0x00A7, /* A0, A7 */
70 : 0x00A8, 0x00A9, 0x00AA, 0x00AB, 0x00AC, 0x0000, 0x00AE, 0x00AF, /* A8, AF */
71 : 0x00B0, 0x00B1, 0x00B2, 0x00B3, 0x00B4, 0x00B5, 0x00B6, 0x00B7, /* B0, B7 */
72 : 0x00B8, 0x00B9, 0x00BA, 0x00BB, 0x00BC, 0x00BD, 0x00BE, 0x00BF, /* B8, BF */
73 : 0x00C0, 0x00C1, 0x00C2, 0x00C3, 0x00C4, 0x00C5, 0x00C6, 0x00C7, /* C0, C7 */
74 : 0x00C8, 0x00C9, 0x00CA, 0x00CB, 0x00CC, 0x00CD, 0x00CE, 0x00CF, /* C8, CF */
75 : 0x00D0, 0x00D1, 0x00D2, 0x00D3, 0x00D4, 0x00D5, 0x00D6, 0x00D7, /* D0, D7 */
76 : 0x00D8, 0x00D9, 0x00DA, 0x00DB, 0x00DC, 0x00DD, 0x00DE, 0x00DF, /* D8, DF */
77 : 0x00E0, 0x00E1, 0x00E2, 0x00E3, 0x00E4, 0x00E5, 0x00E6, 0x00E7, /* E0, E7 */
78 : 0x00E8, 0x00E9, 0x00EA, 0x00EB, 0x00EC, 0x00ED, 0x00EE, 0x00EF, /* E8, EF */
79 : 0x00F0, 0x00F1, 0x00F2, 0x00F3, 0x00F4, 0x00F5, 0x00F6, 0x00F7, /* F0, F7 */
80 : 0x00F8, 0x00F9, 0x00FA, 0x00FB, 0x00FC, 0x00FD, 0x00FE, 0x00FF /* F8, FF */
81 : };
82 :
83 : /* Definition of the interval type */
84 : struct pdfdocenc_map_interval_s {
85 : pdf_u8_t interval_start;
86 : pdf_u8_t interval_stop;
87 : };
88 : typedef struct pdfdocenc_map_interval_s pdfdocenc_map_interval_t;
89 :
90 : /* Direct intervals */
91 : #define PDFDOCENC_MDI 10
92 : static const pdfdocenc_map_interval_t pdfdocenc_map_direct[PDFDOCENC_MDI] = {
93 : { 0x09, 0x0A },
94 : { 0x0D, 0x0D },
95 : { 0x20, 0x7E },
96 : { 0xA1, 0xAC },
97 : { 0xAE, 0xFF }
98 : };
99 :
100 : /* Indirect intervals */
101 : #define PDFDOCENC_MII 10
102 : static const pdfdocenc_map_interval_t pdfdocenc_map_indirect[PDFDOCENC_MII] = {
103 : { 0x18, 0x1F },
104 : { 0x80, 0x9E },
105 : { 0xA0, 0xA0 }
106 : };
107 :
108 : /* Undefined intervals, probably not really needed
109 : #define PDFDOCENC_MUI 10
110 : static const pdfdocenc_map_interval_t pdfdocenc_map_undefined[PDFDOCENC_MUI] = {
111 : { 0x00, 0x08 },
112 : { 0x0B, 0x0C },
113 : { 0x0E, 0x17 },
114 : { 0x7F, 0x7F },
115 : { 0x9F, 0x9F },
116 : { 0xAD, 0xAD }
117 : };
118 : */
119 :
120 :
/* Number of bytes in a UTF-8 sequence, indexed by the value of its first
 * byte.  A value of 0 means the byte can never start a valid sequence:
 *   0xxx xxxx -> 1 byte   [00,7F]
 *   10xx xxxx -> 0        [80,BF]  continuation byte, not a lead byte
 *   1100 000x -> 0        [C0,C1]  would only encode overlong forms of
 *                                  U+0000 - U+007F
 *   110x xxxx -> 2 bytes  [C2,DF]
 *   1110 xxxx -> 3 bytes  [E0,EF]
 *   1111 0xxx -> 4 bytes  [F0,F4]
 *   1111 01xx -> 0        [F5,F7]  would only encode values above U+10FFFF
 *   1111 1xxx -> 0        [F8,FF]  longer sequences are not allowed to
 *                                  represent Unicode points
 */
static const unsigned char n_bytes_in_utf8_char [256] = {
  1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, /* 00 - 0F */
  1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, /* 10 - 1F */
  1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, /* 20 - 2F */
  1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, /* 30 - 3F */
  1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, /* 40 - 4F */
  1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, /* 50 - 5F */
  1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, /* 60 - 6F */
  1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, /* 70 - 7F */
  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* 80 - 8F */
  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* 90 - 9F */
  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* A0 - AF */
  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* B0 - BF */
  0,0,2,2,2,2,2,2,2,2,2,2,2,2,2,2, /* C0 - CF */
  2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, /* D0 - DF */
  3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3, /* E0 - EF */
  4,4,4,4,4,0,0,0,0,0,0,0,0,0,0,0  /* F0 - FF */
};
140 :
141 :
142 : /* UNICODE BOM bytes encoded in the different built-in UNICODE encodings */
143 : static const pdf_text_bom_t unicode_bom [PDF_TEXT_MAX_UNICODE_ENC] = {
144 : { {239,187,191, 0}, 3 }, /* PDF_TEXT_UTF8 */
145 : { {254,255, 0, 0}, 2 }, /* PDF_TEXT_UTF16_BE */
146 : { {255,254, 0, 0}, 2 }, /* PDF_TEXT_UTF16_LE */
147 : { { 0, 0, 0, 0}, 0 }, /* N/A (UTF-16 HE) */
148 : { { 0, 0,254,255}, 4 }, /* PDF_TEXT_UTF32_BE */
149 : { {255,254, 0, 0}, 4 }, /* PDF_TEXT_UTF32_LE */
150 : { { 0, 0, 0, 0}, 0 } /* N/A (UTF-32HE) */
151 : };
152 :
153 :
154 : /******************** PDF Doc Encoding to UTF-32 conversion *******************/
155 :
156 : static pdf_text_utf32_char_t
157 : pdf_text_pdfdocenc_point_to_utf32he_point(const pdf_char_t pdfdocenc_val)
158 : {
159 : pdf_text_utf32_char_t utf32val;
160 1130 : utf32val.i = pdfdocenc_map[(int)pdfdocenc_val];
161 1130 : return utf32val;
162 : }
163 :
164 : /* Static function to convert from PDF Doc Encoding to UTF-32HE, lossless */
165 : pdf_status_t
166 : pdf_text_pdfdocenc_to_utf32he(const pdf_char_t *input_data,
167 : const pdf_size_t input_length,
168 : pdf_char_t **p_output_data,
169 : pdf_size_t *p_output_length)
170 177 : {
171 : /* Note: PDF Doc Encoding has always 8 bits per character.
172 : * This means that, if length of origin string is N bytes, the number of
173 : * required bytes for the UTF32 representation of the string is 4N.
174 : * (Each PDFDocEncoding byte is expanded to 4 bytes in UTF32. */
175 :
176 : pdf_size_t i; /* index for the origin string data */
177 : pdf_size_t j; /* index for the destination string data */
178 : pdf_char_t *data;
179 : pdf_size_t new_length;
180 :
181 : /* Get new string length... */
182 177 : new_length = 4 * input_length;
183 : /* Create destination string with correct size (but empty!) */
184 177 : data = (pdf_char_t *)pdf_alloc(new_length);
185 177 : if(data == NULL)
186 : {
187 0 : return PDF_ENOMEM;
188 : }
189 :
190 1295 : for(i = 0, j = 0; i < input_length; i++, j+=4)
191 : {
192 : pdf_text_utf32_char_t utf32val;
193 : /* Get value... */
194 2260 : utf32val = pdf_text_pdfdocenc_point_to_utf32he_point(input_data[i]);
195 1130 : if(utf32val.i == 0)
196 : {
197 : /* Oops, the given input byte is UNDEFINED in PDF Doc Encoding */
198 12 : pdf_dealloc(data);
199 12 : return PDF_EBADDATA;
200 : }
201 : /* Copy converted value to output */
202 1118 : memcpy(&(data[j]), &(utf32val), 4);
203 : }
204 :
205 : /* Everything went ok, set output data */
206 165 : *p_output_data = data;
207 165 : *p_output_length = new_length;
208 :
209 165 : return PDF_OK;
210 : }
211 :
212 :
213 : /******************** UTF-32 to PDF Doc Encoding conversion *******************/
214 :
215 : static pdf_char_t
216 : pdf_text_utf32he_point_to_pdfdocenc_point(const pdf_text_utf32_char_t utf32val)
217 : {
218 :
219 : pdf_u8_t i;
220 :
221 :
222 : /* If the given UTF-32 point is encoded in a single byte, then direct
223 : * conversion is possible */
224 805 : if(utf32val.i <= 0xFF)
225 : {
226 : /* Check if direct conversion is possible */
227 2409 : for(i=0; i<PDFDOCENC_MDI; ++i)
228 : {
229 2409 : if((utf32val.i <= pdfdocenc_map_direct[i].interval_stop) && \
230 : (utf32val.i >= pdfdocenc_map_direct[i].interval_start))
231 : {
232 : /* If the unicode char is among this intervals, direct conversion
233 : * is possible (single byte!) */
234 803 : return (pdf_char_t)utf32val.i;
235 : }
236 : }
237 : }
238 : else
239 : {
240 : /* Check if indirect conversion is possible */
241 22 : for(i=0; i<PDFDOCENC_MII; ++i)
242 : {
243 :
244 : /* Simple search in the interval */
245 20 : pdf_size_t search_index = pdfdocenc_map_indirect[i].interval_start;
246 114 : while((search_index <= pdfdocenc_map_indirect[i].interval_stop) )
247 : {
248 94 : if(pdfdocenc_map[search_index] == utf32val.i)
249 : {
250 : /* Directly apply search index as character */
251 0 : return (pdf_char_t) search_index;
252 : }
253 94 : search_index++;
254 : }
255 : }
256 : }
257 :
258 : /* If neither Direct conversion nor Indirect conversion are available,
259 : * the given character is UNDEFINED. Set default character when there is no
260 : * direct mapping to PDF Doc Encoding. This means that every conversion
261 : * to PDF Doc Encoding will NEVER fail if there is no mapping of a code point
262 : * in PDF Doc Encoding */
263 2 : return (pdf_char_t) '?';
264 : }
265 :
266 :
267 : /* Static function to convert from UTF-32HE to PDF Doc Encoding, with loss of
268 : information */
269 : pdf_status_t
270 : pdf_text_utf32he_to_pdfdocenc(const pdf_char_t *input_data,
271 : const pdf_size_t input_length,
272 : pdf_char_t **p_output_data,
273 : pdf_size_t *p_output_length)
274 119 : {
275 : /* Note: UTF-32 has always 32 bits per character.
276 : This means that, if length of origin string is 4N bytes, the number of
277 : required bytes for the PDFDocEncoding representation of the string is N.
278 : (Each UNICODE 4-byte character is represented as 1-byte character in
279 : PDFDocEncoding). This means that LOSS of information could happen */
280 :
281 : int i; /* index for the origin string data */
282 : int j; /* index for the destination string data */
283 :
284 : /* Check if the length of the origin string is multiple of 4 bytes */
285 119 : if(input_length % 4 != 0)
286 : {
287 : PDF_DEBUG_BASE("Input length must be multiple of 4! Invalid UTF-32 data."
288 : "(Length: %d)", (int)input_length);
289 0 : return PDF_EBADDATA;
290 : }
291 :
292 : /* Get new string length... */
293 119 : *p_output_length = input_length / 4;
294 :
295 : /* Create destination string with correct size (but empty!) */
296 119 : *p_output_data = (pdf_char_t *)pdf_alloc(*p_output_length);
297 119 : if(*p_output_data == NULL)
298 : {
299 0 : return PDF_ENOMEM;
300 : }
301 :
302 924 : for(i = 0, j = 0; i < input_length; i+=4, j++)
303 : {
304 : pdf_text_utf32_char_t utf32val; /* UNICODE char */
305 : /* Get UTF-32 char to convert */
306 805 : memcpy(&utf32val, &input_data[i], 4);
307 : /* Convert character to PDF Doc Encoding */
308 1610 : (*p_output_data)[j] = pdf_text_utf32he_point_to_pdfdocenc_point(utf32val);
309 : }
310 :
311 119 : return PDF_OK;
312 : }
313 :
314 :
315 : /*********************** UTF-32 to UTF-32 conversions *************************/
316 :
317 : /* Function to convert from UTF32-HE to UTF32-HE, lossless */
318 : pdf_status_t
319 : pdf_text_utf32he_to_utf32he(const pdf_char_t *input_data,
320 : const pdf_size_t input_length,
321 : const pdf_bool_t swap,
322 : const pdf_bool_t check_input_he,
323 : const pdf_bool_t check_output_he,
324 : pdf_char_t **p_output_data,
325 : pdf_size_t *p_output_length)
326 744 : {
327 : pdf_size_t walker;
328 744 : pdf_size_t bom_bytes = 0;
329 744 : pdf_char_t *new_data = NULL;
330 744 : pdf_size_t new_size = 0;
331 :
332 744 : if(input_length % 4 != 0)
333 : {
334 : /* Invalid number of bytes! */
335 : PDF_DEBUG_BASE("Input length must be multiple of 4! Invalid UTF-32 data."
336 : " (Length: %d)", (int)input_length);
337 45 : return PDF_EBADDATA;
338 : }
339 :
340 : /* Check if BOM is present... and skip it if so */
341 699 : if(pdf_text_check_unicode_bom (input_data, input_length,
342 : PDF_TEXT_UTF32_HE, swap))
343 : {
344 : /* Skip BOM */
345 30 : bom_bytes = 4;
346 : }
347 :
348 : /* Allocate memory */
349 699 : new_size = input_length - bom_bytes;
350 : /* Create destination string with correct size (but empty!) */
351 699 : new_data = (pdf_char_t *)pdf_alloc(new_size);
352 699 : if(new_data == NULL)
353 : {
354 0 : return PDF_ENOMEM;
355 : }
356 :
357 : /* Change endianness of each 32bit value... */
358 8795 : for(walker = bom_bytes; walker < input_length; walker+=4)
359 : {
360 : pdf_text_utf32_char_t utf32val;
361 8096 : memcpy(&utf32val, &input_data[walker], 4);
362 :
363 : /* Check code point validity (if the input is Host Endian) */
364 : /* Code point must not be a surrogate code unit, and must be in the
365 : * U+00000000 - U+0010FFFF range */
366 8096 : if(check_input_he)
367 : {
368 3368 : if((utf32val.i > 0x10FFFF) || \
369 : ((utf32val.i >= 0xD800) && \
370 : (utf32val.i <= 0xDFFF)))
371 : {
372 : /* Invalid UTF-32 code point received */
373 : PDF_DEBUG_BASE("Invalid input UTF-32HE code point: "
374 : "%.2X:%.2X:%.2X:%.2X",
375 : utf32val.c[0],
376 : utf32val.c[1],
377 : utf32val.c[2],
378 : utf32val.c[3]);
379 0 : return PDF_EBADTEXT;
380 : }
381 : }
382 :
383 : /* Swap bytes */
384 8096 : if(swap)
385 : {
386 4770 : utf32val.i = PDF_TEXT_CHANGE_ENDIANNESS_32BIT(utf32val.i);
387 : }
388 :
389 : /* Check code point validity (if the output is Host Endian) */
390 : /* Code point must not be a surrogate code unit, and must be in the
391 : * U+00000000 - U+0010FFFF range */
392 8096 : if(check_output_he)
393 : {
394 4728 : if((utf32val.i > 0x10FFFF) || \
395 : ((utf32val.i >= 0xD800) && \
396 : (utf32val.i <= 0xDFFF)))
397 : {
398 : /* Invalid UTF-32 code point received */
399 : PDF_DEBUG_BASE("Invalid output UTF-32HE code point: "
400 : "%.2X:%.2X:%.2X:%.2X",
401 : utf32val.c[0],
402 : utf32val.c[1],
403 : utf32val.c[2],
404 : utf32val.c[3]);
405 0 : return PDF_EBADTEXT;
406 : }
407 : }
408 :
409 : /* Copy value */
410 8096 : memcpy(&(new_data[walker-bom_bytes]), &utf32val, 4);
411 : }
412 :
413 : /* Really set output data */
414 699 : *p_output_data = new_data;
415 699 : *p_output_length = new_size;
416 :
417 699 : return PDF_OK;
418 : }
419 :
420 :
421 : /*********************** UTF-16 to UTF-32 conversions *************************/
422 :
423 : /* Static function to convert a given UTF-16HE character (with one or two words)
424 : * to UTF-32HE. The number of bytes (2 or 4) used from the input UTF-16BE point
425 : * is returned (or 0 if the UTF-16HE point is not valid */
426 : static pdf_size_t
427 : pdf_text_utf16he_point_to_utf32he_point(pdf_text_utf16_char_t utf16val[2],
428 : pdf_text_utf32_char_t *p_utf32val)
429 : {
430 : pdf_size_t n_bytes;
431 :
432 : /* Ok, so how can we know if the UTF16 character is encoded using 2 or
433 : * 4 bytes? A surrogate pair consists on two 16-bit values of the
434 : * UTF16 encoding. Each word (16bit-value) within the surrogate pair
435 : * doesn't represent a valid character, as it is enclosed in the
436 : * following interval: U+D800 - U+DFFF. This means that if the first
437 : * word analysed is outside this interval, it will be treated
438 : * separately. If the first word is within this interval, it is
439 : * expected to have the second word within the interval as well. If
440 : * this doesn't happen it will be treated as a badly formatted UTF16
441 : * string. In fact, there are two different intervals within the surrogate
442 : * points themselves: the High surrogate point will be in the U+D800 -
443 : * U+DBFF interval, and the Low surrogate point will be in the U+DC00 -
444 : * U+DFFF interval. */
445 :
446 1339 : if(((utf16val[0].i) >= 0xD800) && \
447 : ((utf16val[0].i) <= 0xDFFF))
448 : {
449 : /* To have a valid surrogate pair, the first UTF-16 value must be the High
450 : * surrogate code unit, and the second UTF-16 value must be the Low
451 : * surrogate code unit. */
452 40 : if(((utf16val[0].i) <= 0xDFFF) && \
453 : ((utf16val[1].i) >= 0xDC00) && \
454 : ((utf16val[1].i) <= 0xDFFF))
455 : {
456 : /* Yes, second word is within the validity interval, it seems a
457 : * correct 32-bit representation of a character in UTF16BE */
458 40 : n_bytes = 4;
459 40 : (*p_utf32val).i = 0x10000 + \
460 : (((utf16val[0].i) - 0xD800) << 10) + \
461 : ((utf16val[1].i) - 0xDC00);
462 : }
463 : /* else Oops, invalid UTF-16HE surrogate pair! Input data is not well
464 : * formed... */
465 : else
466 : {
467 : PDF_DEBUG_BASE("Invalid UTF-16HE point! %.2X:%.2X:%.2X:%.2X",
468 : utf16val[0].c[0], utf16val[0].c[1],
469 : utf16val[1].c[0], utf16val[1].c[1]);
470 0 : n_bytes = 0;
471 : }
472 : }
473 : else
474 : {
475 : /* No multiword representation, just 16bits for this character
476 : * So conversion is direct... */
477 1299 : n_bytes = 2;
478 1299 : (*p_utf32val).i = (utf16val[0]).i;
479 : }
480 1339 : return n_bytes;
481 : }
482 :
483 :
484 : /* Function to convert from UTF16-HE to UTF32-HE, lossless */
485 : pdf_status_t
486 : pdf_text_utf16he_to_utf32he(const pdf_char_t *input_data,
487 : const pdf_size_t input_length,
488 : const pdf_bool_t swap,
489 : pdf_char_t **p_output_data,
490 : pdf_size_t *p_output_length,
491 : pdf_char_t **p_remaining_data,
492 : pdf_size_t *p_remaining_length)
493 148 : {
494 : /* Note: UTF-16 has either 16 or 32 bits per character.
495 : * This means that, if length of origin string is N bytes, the number of
496 : * required bytes for the UTF-32 representation of the string is 2N in
497 : * the worst case (in the case of having all the UTF-16 characters encoded
498 : * with 16bits).
499 : * (Each UTF-16 is expanded to 4 bytes in UTF-32. */
500 :
501 : pdf_char_t *data;
502 : pdf_size_t new_string_length;
503 : pdf_size_t new_string_length_worst;
504 : pdf_size_t delta_in_utf16be;
505 : int i; /* index for the origin string data */
506 : int j; /* index for the destination string data */
507 : pdf_text_utf16_char_t utf16val[2];
508 : pdf_text_utf32_char_t utf32val;
509 148 : short stop_conversion = PDF_FALSE;
510 148 : short check_lang_code = PDF_FALSE;
511 148 : int bom_bytes = 0;
512 :
513 : /* Check if length is multiple of 2 (data must come in pairs of bytes!) */
514 148 : if((input_length < 2) || \
515 : (input_length % 2) != 0)
516 : {
517 : PDF_DEBUG_BASE("Input length must be multiple of 2 and greater than 2!"
518 : " Invalid UTF-16 data. (Length: %d)", (int)input_length);
519 45 : return PDF_EBADDATA;
520 : }
521 :
522 : /* Check if BOM is present... and skip it if so */
523 103 : if(pdf_text_check_unicode_bom (input_data, input_length,
524 : PDF_TEXT_UTF16_HE, swap))
525 : {
526 : /* Skip BOM */
527 35 : bom_bytes = 2;
528 : }
529 :
530 : /* Get new string worst length... (don't consider BOM bytes) */
531 103 : new_string_length_worst = 2 * (input_length - bom_bytes);
532 : /* Create destination string with worst size (but empty!) */
533 103 : data = (pdf_char_t *)pdf_alloc(new_string_length_worst);
534 103 : if(data == NULL)
535 : {
536 0 : return PDF_ENOMEM;
537 : }
538 :
539 : /* Initiate final string length */
540 103 : new_string_length = 0;
541 :
542 : /* Initiate indexes */
543 103 : i = bom_bytes; /* Skipping BOM if present... */
544 103 : j = 0;
545 :
546 : /* Check if specific country/language could be found */
547 103 : if((p_remaining_length != NULL) && \
548 : (p_remaining_data != NULL))
549 : {
550 33 : check_lang_code = PDF_TRUE;
551 : }
552 :
553 : /* This while loop will be done until the end of the input data OR until
554 : * the moment a new country/language code identifier is found. But, this
555 : * extra stop condition will only be available if valid `p_remaining_data'
556 : * and `p_remaining_length' pointers are given as input. */
557 1454 : while((i < input_length) && \
558 : (!stop_conversion))
559 : {
560 1363 : if((check_lang_code) && \
561 : (input_data[i+1] == PDF_TEXT_LCI_1) && \
562 : (input_data[i] == PDF_TEXT_LCI_0))
563 : {
564 : /* Stop conversion... due to new lang/code initializer */
565 12 : stop_conversion = PDF_TRUE;
566 : /* Set the output remaining data... */
567 12 : *p_remaining_length = input_length - i;
568 12 : *p_remaining_data = (pdf_char_t *) &input_data[i];
569 : }
570 : else
571 : {
572 : /* Store the UTF-16(BE/LE) data in the intermediate variable */
573 1339 : utf16val[0].c[0] = input_data[i];
574 1339 : utf16val[0].c[1] = input_data[i+1];
575 1339 : if((i+3) < input_length)
576 : {
577 1286 : utf16val[1].c[0] = input_data[i+2];
578 1286 : utf16val[1].c[1] = input_data[i+3];
579 : }
580 : /* else, last point should be only 1-word length */
581 : else
582 : {
583 53 : utf16val[1].c[0] = 0x00;
584 53 : utf16val[1].c[1] = 0x00;
585 : }
586 :
587 1339 : if(swap)
588 : {
589 : /* Input data must be swapped in order to convert it to
590 : * host endian */
591 835 : utf16val[0].i = PDF_TEXT_CHANGE_ENDIANNESS_16BIT(utf16val[0].i);
592 835 : utf16val[1].i = PDF_TEXT_CHANGE_ENDIANNESS_16BIT(utf16val[1].i);
593 : }
594 :
595 : /* Change UTF-16HE point to UTF-32HE point */
596 1339 : delta_in_utf16be = pdf_text_utf16he_point_to_utf32he_point(utf16val,
597 : &utf32val);
598 1339 : if(delta_in_utf16be == 0)
599 : {
600 : /* Oops, invalid UTF-16HE point found! */
601 0 : pdf_dealloc(data);
602 : PDF_DEBUG_BASE("Conversion from UTF-16 to UTF-32HE stopped");
603 0 : return PDF_EBADTEXT;
604 : }
605 :
606 : /* Finally, store the UTF-32 representation of the char in the output
607 : * string... */
608 1339 : data[j] = utf32val.c[0];
609 1339 : data[j+1] = utf32val.c[1];
610 1339 : data[j+2] = utf32val.c[2];
611 1339 : data[j+3] = utf32val.c[3];
612 :
613 : /* Update final string length after having added this character */
614 1339 : new_string_length+=4;
615 :
616 : /* Update indexes */
617 1339 : i += delta_in_utf16be;
618 1339 : j += 4;
619 : }
620 : }
621 :
622 : /* Everything went ok, set output data */
623 103 : *p_output_data = data;
624 : /* Set output length... */
625 103 : *p_output_length = new_string_length;
626 :
627 : /* Check if the stop flag was set due to finding lang/country code
628 : * initializer. If not found, set zero remaining length and NULL
629 : * remaining str */
630 103 : if((!stop_conversion) && \
631 : (p_remaining_length != NULL) && \
632 : (p_remaining_data != NULL))
633 : {
634 21 : *p_remaining_length = 0;
635 21 : *p_remaining_data = NULL;
636 : }
637 :
638 : /* Now, if the real output string length is not equal to the worst string
639 : * length, we will reallocate memory for the correct size. This will only
640 : * happen when at least one character is not encoded with 32bits in UTF-16. */
641 103 : if(new_string_length != new_string_length_worst)
642 : {
643 : /* Recreate object with correct size... */
644 50 : *p_output_data = (pdf_char_t *)pdf_realloc(*p_output_data,
645 : new_string_length);
646 50 : if(*p_output_data == NULL)
647 : {
648 0 : return PDF_ENOMEM;
649 : }
650 : }
651 103 : return PDF_OK;
652 : }
653 :
654 :
655 :
656 : /*********************** UTF-32 to UTF-16 conversions *************************/
657 :
658 :
659 : /* Static function to convert a given UTF-32HE character to UTF-16HE. The number
660 : * of bytes used in the output UTF-16HE point is returned (or 0
661 : * if the UTF-16HE point is not valid */
662 : static pdf_size_t
663 : pdf_text_utf32he_point_to_utf16he_point(pdf_text_utf32_char_t utf32val,
664 : pdf_text_utf16_char_t utf16val[2])
665 : {
666 : pdf_size_t n_bytes;
667 :
668 1008 : if((utf32val.i >= 0xD800) && \
669 : (utf32val.i <= 0xDFFF))
670 : {
671 : PDF_DEBUG_BASE("Invalid UTF-32HE point (surrogate pair found)! "
672 : "%.2X:%.2X:%.2X:%.2X",
673 : utf32val.c[0], utf32val.c[1],
674 : utf32val.c[2], utf32val.c[3]);
675 0 : n_bytes = 0;
676 : }
677 : /* Check if multiword (32bits) representation is needed */
678 1008 : else if( utf32val.i >= 0x10000 )
679 : {
680 : /* Ok so it seems a multiword representation...
681 : * Now check input UTF-32HE representation to see if it really is a
682 : * Unicode point (from 0x00000 to 0x10FFFF) */
683 32 : if (utf32val.i <= 0x10FFFF)
684 : {
685 : /* 32 bits are required for this char */
686 32 : n_bytes = 4;
687 32 : utf32val.i -= 0x10000;
688 : /* Process higher 10 bits, by shifting to the right 10 bits */
689 32 : (utf16val[0]).i = (utf32val.i >> 10) + 0xD800;
690 : /* Process lower 10 bits, by masking the value with 0x03FF */
691 32 : (utf16val[1]).i = (utf32val.i & 0x03FF) + 0xDC00;
692 : }
693 : else
694 : {
695 : /* else Oops, invalid 32-bit character! Input data is not well
696 : * formed... */
697 : PDF_DEBUG_BASE("Invalid UTF-32HE point! %.2X:%.2X:%.2X:%.2X",
698 : utf32val.c[0], utf32val.c[1],
699 : utf32val.c[2], utf32val.c[3]);
700 0 : n_bytes = 0;
701 : }
702 : }
703 : else
704 : {
705 976 : n_bytes = 2;
706 : /* No multiword representation, just 16bits for this character
707 : * So conversion is direct.... */
708 976 : (utf16val[0]).i = utf32val.i;
709 976 : (utf16val[1]).i = 0x0000;
710 : }
711 1008 : return n_bytes;
712 : }
713 :
714 :
715 : /* Function to convert from UTF-32HE to UTF-16, lossless */
716 : pdf_status_t
717 : pdf_text_utf32he_to_utf16he(const pdf_char_t *input_data,
718 : const pdf_size_t input_length,
719 : pdf_char_t **p_output_data,
720 : pdf_size_t *p_output_length,
721 : pdf_bool_t swap)
722 80 : {
723 : /* Note: UTF-16BE has either 16 or 32 bits per character.
724 : This means that, if length of origin string is 4N bytes, the number of
725 : required bytes for the UTF16BE representation of the string is 4N in
726 : the worst case. (When all the UTF16be representations have 32bits)
727 : */
728 : pdf_size_t new_string_length;
729 : pdf_size_t new_string_length_worst;
730 : pdf_size_t delta_in_utf16be;
731 : int i; /* index for the origin string data */
732 : int j; /* index for the destination string data */
733 : pdf_text_utf16_char_t utf16val[2];
734 : pdf_text_utf32_char_t utf32val;
735 : pdf_char_t *data;
736 :
737 : /* Get new string length (worst case)... */
738 80 : new_string_length_worst = input_length;
739 : /* Create destination string with correct size (but empty!) */
740 80 : data = (pdf_char_t *)pdf_alloc(new_string_length_worst);
741 80 : if(data == NULL)
742 : {
743 0 : return PDF_ENOMEM;
744 : }
745 :
746 : /* Initiate real string length, without considering marker bytes */
747 80 : new_string_length = 0;
748 :
749 1088 : for( i = 0, j = 0; i < input_length; i += 4, j += delta_in_utf16be )
750 : {
751 : /* Get UCS4 char, as a direct memory copy from the input array */
752 1008 : memcpy(&utf32val, &(input_data[i]), 4);
753 :
754 2016 : delta_in_utf16be = pdf_text_utf32he_point_to_utf16he_point(utf32val,
755 : utf16val);
756 :
757 1008 : if(delta_in_utf16be == 0)
758 : {
759 : /* Oops, invalid UTF-16HE point found! */
760 0 : pdf_dealloc(data);
761 : PDF_DEBUG_BASE("Conversion from UTF-32HE to UTF-16 stopped");
762 0 : return PDF_EBADTEXT;
763 : }
764 :
765 : /* Change endianness of each output word if required */
766 1008 : if(swap)
767 : {
768 : /* Change to BE */
769 756 : (utf16val[0]).i = PDF_TEXT_CHANGE_ENDIANNESS_16BIT((utf16val[0]).i);
770 756 : (utf16val[1]).i = PDF_TEXT_CHANGE_ENDIANNESS_16BIT((utf16val[1]).i);
771 : }
772 :
773 : /* Finally, store the UTF16BE representation of the char in the output
774 : * string... */
775 1008 : memcpy(&(data[j]), &utf16val[0], delta_in_utf16be);
776 : /* Update new string legth... */
777 1008 : new_string_length += delta_in_utf16be;
778 : }
779 :
780 : /* If everything went ok, set output data */
781 80 : *p_output_data = data;
782 : /* Set final output length of the generated string */
783 80 : *p_output_length = new_string_length;
784 :
785 : /* If the real required string length is not equal to the initial worst length
786 : * then update string with correct length. */
787 80 : if(new_string_length != new_string_length_worst)
788 : {
789 : /* Recreate object with correct smaller size... */
790 80 : *p_output_data = (pdf_char_t *)pdf_realloc(*p_output_data,
791 : new_string_length);
792 80 : if(*p_output_data == NULL)
793 : {
794 0 : return PDF_ENOMEM;
795 : }
796 : }
797 :
798 80 : return PDF_OK;
799 : }
800 :
801 :
802 : /************************ UTF-8 to UTF-32 conversions *************************/
803 :
804 : /* Static function to convert a given UTF-8 character to UTF-32HE. The number
805 : * of bytes used in the input UTF-8 point is returned (or 0 if the UTF-8 point
806 : * is not valid */
807 : static pdf_size_t
808 : pdf_text_utf8_point_to_utf32he_point(const pdf_text_utf8_char_t utf8val[4],
809 : const pdf_size_t n_bytes,
810 : pdf_text_utf32_char_t *p_utf32val)
811 : {
812 : int c; /* index for the utf-8 representation of every char */
813 :
814 : /* Check validity of the UTF-8 bytes:
815 : * - First byte can be neither 0xFF nor 0xFE
816 : * - The following bytes must be in the [80-BF] range! (10xxxxxx) */
817 1982 : for(c=0; c<n_bytes; c++)
818 : {
819 1027 : if(((c == 0) && ((utf8val[0] == 0xFF) || (utf8val[0] == 0xFE))) || \
820 : ((c != 0) && ((utf8val[c] < 0x80) || (utf8val[c] > 0xBF))))
821 : {
822 : PDF_DEBUG_BASE("Invalid UTF-8 character: %.2X:%.2X:%.2X:%.2X",
823 : (int)utf8val[0],
824 : ((n_bytes>1)?((int)utf8val[1]):0),
825 : ((n_bytes>2)?((int)utf8val[2]):0),
826 : ((n_bytes>3)?((int)utf8val[3]):0));
827 10 : return 0;
828 : }
829 : }
830 :
831 : /* Load all the bytes of the UTF-8 representation in the UTF-32HE var */
832 955 : switch(n_bytes)
833 : {
834 : case 1:
835 933 : (*p_utf32val).i = (utf8val[0] & 0x7F); /* 0111 1111 */
836 : break;
837 : case 2:
838 10 : (*p_utf32val).i = ((utf8val[0] & 0x1F) << 6) + /* 0001 1111 */
839 : (utf8val[1] & 0x3F); /* 0011 1111 */
840 : break;
841 : case 3:
842 4 : (*p_utf32val).i = ((utf8val[0] & 0x0F) << 12) + /* 0000 1111 */
843 : ((utf8val[1] & 0x3F) << 6) + /* 0011 1111 */
844 : (utf8val[2] & 0x3F); /* 0011 1111 */
845 : break;
846 : case 4:
847 8 : (*p_utf32val).i = ((utf8val[0] & 0x07) << 18) + /* 0000 1111 */
848 : ((utf8val[1] & 0x3F) << 12) + /* 0000 1111 */
849 : ((utf8val[2] & 0x3F) << 6) + /* 0011 1111 */
850 : (utf8val[3] & 0x3F); /* 0011 1111 */
851 : break;
852 : default:
853 : /* Should never happen! */
854 0 : return 0;
855 : }
856 :
857 955 : return n_bytes;
858 : }
859 :
860 : /* Function to convert from UTF-8 to UTF-32HE, lossless */
861 : pdf_status_t
862 : pdf_text_utf8_to_utf32he(const pdf_char_t *input_data,
863 : const pdf_size_t input_length,
864 : pdf_char_t **p_output_data,
865 : pdf_size_t *p_output_length)
866 129 : {
867 : /* Note: PDF Doc Encoding has always 8 bits per character.
868 : * This means that, if length of origin string is N bytes, the number of
869 : * required bytes for the UTF32 representation of the string is 4N.
870 : * (Each PDFDocEncoding byte is expanded to 4 bytes in UTF32. */
871 : pdf_size_t new_string_length;
872 : pdf_size_t new_string_length_worst;
873 : pdf_size_t bom_bytes;
874 : int i; /* index for the origin string data */
875 : int j; /* index for the destination string data */
876 : pdf_size_t delta_in_utf8;
877 :
878 : pdf_char_t *data;
879 :
880 : /* Check if BOM is present... and skip it if so */
881 129 : bom_bytes = 0;
882 129 : if(pdf_text_check_unicode_bom (input_data, input_length, PDF_TEXT_UTF8, 0))
883 : {
884 : /* Skip BOM in UTF-8 */
885 10 : bom_bytes = 3;
886 : }
887 :
888 : /* Get new string length... */
889 129 : new_string_length_worst = 4 * (input_length - bom_bytes);
890 :
891 : /* Create destination string with worst size (but empty!) */
892 129 : data = (pdf_char_t *)pdf_alloc(new_string_length_worst);
893 129 : if(data == NULL)
894 : {
895 0 : return PDF_ENOMEM;
896 : }
897 :
898 129 : new_string_length = 0;
899 1084 : for(i = bom_bytes, j = 0; i < input_length; i+=delta_in_utf8, j+=4)
900 : {
901 : pdf_text_utf32_char_t utf32val;
902 : pdf_text_utf8_char_t utf8val[4];
903 :
904 : /* Check number of bytes needed for the UTF-8 char */
905 985 : delta_in_utf8 = n_bytes_in_utf8_char[(int)input_data[i]];
906 :
907 : /* Check validity of first byte in UTF-8 */
908 : /* Check if the required bytes are outside the input data stream */
909 985 : if((delta_in_utf8 == 0) || \
910 : ((input_length - i) < delta_in_utf8))
911 : {
912 : PDF_DEBUG_BASE("Wrong UTF-8 data received (UTF-8 length: %d, "
913 : "Remaining length: %d", delta_in_utf8,
914 : (input_length - i));
915 20 : pdf_dealloc(data);
916 20 : return PDF_EBADDATA;
917 : }
918 :
919 : /* Store data in intermediate UTF-8 variable */
920 965 : memcpy(&utf8val[0], &input_data[i], delta_in_utf8);
921 :
922 965 : if(pdf_text_utf8_point_to_utf32he_point(utf8val,
923 : delta_in_utf8,
924 : &utf32val) == 0)
925 : {
926 : PDF_DEBUG_BASE("Problem decoding UTF-8 string");
927 10 : pdf_dealloc(data);
928 10 : return PDF_EBADDATA;
929 : }
930 :
931 : /* Copy converted value (in UTF-32HE) to output */
932 955 : memcpy(&(data[j]), &(utf32val), 4);
933 :
934 : /* Update new string length */
935 955 : new_string_length += 4;
936 : }
937 :
938 : /* If everything went ok, set output data */
939 99 : *p_output_data = data;
940 : /* Set final output length of the generated string */
941 99 : *p_output_length = new_string_length;
942 :
943 : /* If the real required string length is not equal to the initial worst length
944 : * then update string with correct length. */
945 99 : if(new_string_length != new_string_length_worst)
946 : {
947 : /* Recreate object with correct smaller size... */
948 14 : *p_output_data = (pdf_char_t *)pdf_realloc(*p_output_data,
949 : new_string_length);
950 14 : if(*p_output_data == NULL)
951 : {
952 0 : return PDF_ENOMEM;
953 : }
954 : }
955 :
956 99 : return PDF_OK;
957 : }
958 :
959 :
960 : /************************ UTF-32 to UTF-8 conversions *************************/
961 :
962 : /* Static function to convert a given UTF-32HE character to UTF-8. The number
963 : * of bytes used in the output UTF-8 point is returned (or 0 if the UTF-8 point
964 : * is not valid */
965 : static pdf_size_t
966 : pdf_text_utf32he_point_to_utf8_point(const pdf_text_utf32_char_t utf32val,
967 : pdf_text_utf8_char_t utf8val[4])
968 : {
969 : pdf_size_t n_bytes;
970 :
971 381 : if(utf32val.i < 0x80)
972 : {
973 : /* Output is 1 byte */
974 365 : n_bytes = 1;
975 365 : utf8val[0] = (pdf_text_utf8_char_t) utf32val.i;
976 : }
977 16 : else if(utf32val.i < 0x800)
978 : {
979 : /* Output is 2 bytes */
980 4 : n_bytes = 2;
981 : /* Get first byte, using upper 5 bits --> 110xxxxx */
982 4 : utf8val[0] = ((pdf_text_utf8_char_t) (utf32val.i >> 6)) | 0xC0;
983 : /* Get second byte, using lower 6 bits --> 10xxxxxx */
984 4 : utf8val[1] = ((pdf_text_utf8_char_t) (utf32val.i & 0x3F)) | 0x80;
985 : }
986 12 : else if(utf32val.i < 0x10000)
987 : {
988 : /* Output is 3 bytes */
989 4 : n_bytes = 3;
990 : /* Get first byte, using upper 4 bits --> 1110xxxx */
991 4 : utf8val[0] = ((pdf_text_utf8_char_t)(utf32val.i >> 12)) | 0xE0;
992 : /* Get second byte, using middle 6 bits --> 10xxxxxx */
993 4 : utf8val[1] = ((pdf_text_utf8_char_t)((utf32val.i >> 6) & 0x3F)) | 0x80;
994 : /* Get third byte, using lower 6 bits --> 10xxxxxx */
995 4 : utf8val[2] = ((pdf_text_utf8_char_t)(utf32val.i & 0x3F)) | 0x80;
996 : }
997 8 : else if(utf32val.i < 0x0010FFFF)
998 : {
999 : /* Output is 4 bytes */
1000 8 : n_bytes = 4;
1001 : /* Get first byte, using upper 3 bits --> 11110xxx */
1002 8 : utf8val[0] = ((pdf_text_utf8_char_t)(utf32val.i >> 18)) | 0xF0;
1003 : /* Get second byte, using upper-middle 6 bits --> 10xxxxxx */
1004 8 : utf8val[1] = (((pdf_text_utf8_char_t)(utf32val.i >> 12)) & 0x3F) | 0x80;
1005 : /* Get second byte, using lower-middle 6 bits --> 10xxxxxx */
1006 8 : utf8val[2] = (((pdf_text_utf8_char_t)(utf32val.i >> 6)) & 0x3F) | 0x80;
1007 : /* Get third byte, using lower 6 bits --> 10xxxxxx */
1008 8 : utf8val[3] = ((pdf_text_utf8_char_t)(utf32val.i & 0x3F)) | 0x80;
1009 : }
1010 : else
1011 : {
1012 : /* Invalid input UTF-32 val */
1013 : PDF_DEBUG_BASE("Wrong UTF-32BE value! '0x%.2X 0x%.2X 0x%.2X 0x%.2X'",
1014 : utf32val.c[0],utf32val.c[1],utf32val.c[2],utf32val.c[3]);
1015 0 : n_bytes = 0;
1016 : }
1017 381 : return n_bytes;
1018 : }
1019 :
1020 : /* Function to convert from UTF-32HE to UTF-8, lossless */
1021 : pdf_status_t
1022 : pdf_text_utf32he_to_utf8(const pdf_char_t *input_data,
1023 : const pdf_size_t input_length,
1024 : pdf_char_t **p_output_data,
1025 : pdf_size_t *p_output_length)
1026 23 : {
1027 : /* Note: UTF-8 has either 8, 16, 24 or 32 bits per character.
1028 : This means that, if length of origin string is 4N bytes, the number of
1029 : required bytes for the UTF-8 representation of the string is 4N in
1030 : the worst case. (When all the UTF-8 representations have 32bits)
1031 : */
1032 : pdf_size_t new_string_length;
1033 : pdf_size_t new_string_length_worst;
1034 : int i; /* index for the origin string data */
1035 : int j; /* index for the destination string data */
1036 : pdf_char_t *data;
1037 : pdf_size_t delta_in_utf8;
1038 :
1039 : /* Get new string length (worst case)... */
1040 23 : new_string_length_worst = input_length;
1041 : /* Create destination string with correct size (but empty!) */
1042 23 : data = (pdf_char_t *)pdf_alloc(new_string_length_worst);
1043 23 : if(data == NULL)
1044 : {
1045 0 : return PDF_ENOMEM;
1046 : }
1047 :
1048 : /* Initiate real string length, without considering marker bytes */
1049 23 : new_string_length = 0;
1050 :
1051 404 : for( i = 0, j = 0; i < input_length; i += 4, j += delta_in_utf8 )
1052 : {
1053 : pdf_text_utf32_char_t utf32val;
1054 : pdf_text_utf8_char_t utf8val[4];
1055 :
1056 : /* Get UTF-32 char, as a direct memory copy from the input array */
1057 381 : memcpy(&utf32val, &(input_data[i]), 4);
1058 :
1059 762 : delta_in_utf8 = pdf_text_utf32he_point_to_utf8_point(utf32val,utf8val);
1060 381 : if(delta_in_utf8 == 0)
1061 : {
1062 : PDF_DEBUG_BASE("Problem encoding UTF-8 string");
1063 0 : pdf_dealloc(data);
1064 0 : return PDF_EBADTEXT;
1065 : }
1066 :
1067 : /* Store UTF-8 val in output array */
1068 381 : memcpy(&data[j], &(utf8val[0]), delta_in_utf8);
1069 :
1070 : /* Update new_string_length, depending on the bytes used to represent
1071 : * this character in UTF-8 */
1072 381 : new_string_length += delta_in_utf8;
1073 : }
1074 :
1075 : /* If everything went ok, set output data */
1076 23 : *p_output_data = data;
1077 : /* Set final output length of the generated string */
1078 23 : *p_output_length = new_string_length;
1079 :
1080 : /* If the real required string length is not equal to the initial worst length
1081 : * then update string with correct length. */
1082 23 : if(new_string_length != new_string_length_worst)
1083 : {
1084 : /* Recreate object with correct smaller size... */
1085 19 : *p_output_data = (pdf_char_t *)pdf_realloc(*p_output_data,
1086 : new_string_length);
1087 19 : if(*p_output_data == NULL)
1088 : {
1089 0 : return PDF_ENOMEM;
1090 : }
1091 : }
1092 :
1093 23 : return PDF_OK;
1094 : }
1095 :
1096 :
1097 : /*************************** BOM-related functions ****************************/
1098 :
1099 :
1100 : inline pdf_text_bom_t
1101 : pdf_text_get_unicode_bom(enum pdf_text_unicode_encoding_e unicode_encoding)
1102 108 : {
1103 108 : return unicode_bom[unicode_encoding];
1104 : }
1105 :
1106 :
1107 : pdf_bool_t
1108 : pdf_text_check_unicode_bom (const pdf_char_t *data,
1109 : const pdf_size_t size,
1110 : enum pdf_text_unicode_encoding_e enc,
1111 : int swap)
1112 1023 : {
1113 1023 : switch(enc)
1114 : {
1115 : case PDF_TEXT_UTF16_HE:
1116 : case PDF_TEXT_UTF32_HE:
1117 : {
1118 802 : enc += ((PDF_IS_BIG_ENDIAN ^ swap) ? PDF_TEXT_HE_TO_BE:PDF_TEXT_HE_TO_LE);
1119 : }
1120 : case PDF_TEXT_UTF8:
1121 : case PDF_TEXT_UTF16_BE:
1122 : case PDF_TEXT_UTF16_LE:
1123 : case PDF_TEXT_UTF32_BE:
1124 : case PDF_TEXT_UTF32_LE:
1125 : {
1126 1023 : if((size >= unicode_bom[enc].bom_bytes) && \
1127 : (memcmp(data,unicode_bom[enc].bom_data,unicode_bom[enc].bom_bytes)==0))
1128 : {
1129 96 : return PDF_TRUE;
1130 : }
1131 : }
1132 : default:
1133 927 : return PDF_FALSE;
1134 : }
1135 : }
1136 :
1137 : /* End of pdf-text-encoding.c */
|