1 : /* -*- mode: C -*-
2 : *
3 : * File: pdf-text.c
4 : * Date: Fri Jan 11 21:09:56 2008
5 : *
6 : * GNU PDF Library - Encoded Text handling utilities
7 : *
8 : */
9 :
10 : /* Copyright (C) 2008 Free Software Foundation, Inc. */
11 :
12 : /* This program is free software: you can redistribute it and/or modify
13 : * it under the terms of the GNU General Public License as published by
14 : * the Free Software Foundation, either version 3 of the License, or
15 : * (at your option) any later version.
16 : *
17 : * This program is distributed in the hope that it will be useful,
18 : * but WITHOUT ANY WARRANTY; without even the implied warranty of
19 : * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20 : * GNU General Public License for more details.
21 : *
22 : * You should have received a copy of the GNU General Public License
23 : * along with this program. If not, see <http://www.gnu.org/licenses/>.
24 : */
25 :
#include <config.h>

#include <limits.h>
#include <stdlib.h>
#include <string.h>
#include <stdint.h>
#include <stdio.h>
#ifdef HAVE_MALLOC_H
#include <malloc.h>
#endif /* HAVE_MALLOC_H */
#include <xalloc.h>
#include <math.h>

#include <pdf-text.h>
#include <pdf-text-encoding.h>
#include <pdf-text-host-encoding.h>
#include <pdf-text-context.h>
#include <pdf-text-filter.h>
#include <pdf-text-ucd.h>
44 :
45 :
/* Minimum length, in bytes, of the Lang/Country information within a
 * UTF-16BE string (2 bytes for the first marker, 2 bytes for LANG and 2 bytes
 * for the last marker). */
49 : #define PDF_TEXT_LCMINL 6
50 :
51 : /* Maximum size, in bytes, of the Lang/Country information within a UTF16BE
52 : * string (Minimum size + 2 bytes for COUNTRY). */
53 : #define PDF_TEXT_LCMAXL 8
54 :
55 :
/* The longest header that can be requested for a Unicode string with options
 * is that of UTF-16BE with BOM and lang/country information: 2 bytes of BOM +
 * 8 bytes of lang/country = 10 bytes (+ 1 NUL byte). */
59 : #define PDF_TEXT_USHMAXL 11
60 :
61 : /* ---------------- Static (private) functions prototypes ------------------- */
62 :
63 :
64 : /* This function receives as input a valid pdf_text_t element, where the
65 : * language and country code informations will be stored. In addition to this,
66 : * the function receives as input the data string (starting in the first
67 : * marker), and stores a pointer to the continuation of the data string, after
68 : * having read the language/country information. This function really assumes
69 : * that the input data string contains in the first bytes the country/lang
70 : * information.
71 : * Two options are possible:
72 : * XXllXX (6 bytes, XX is the marker, ll the language)
73 : * XXllccXX (8 bytes, XX is the marker, ll the language and cc the country)
74 : */
75 : static pdf_status_t
76 : pdf_text_get_lang_from_utf16be(pdf_text_t element,
77 : pdf_char_t **str_out,
78 : pdf_size_t *str_out_length,
79 : const pdf_char_t *str_in,
80 : const pdf_size_t str_in_length);
81 :
82 : /* Function to get the header of a unicode string as requested in the
83 : * `options' field when calling `pdf_text_get_unicode'. The header can be:
84 : * - BOM
85 : * - BOM + Lang/Country info (only if UTF-16BE requested)
86 : * - Lang/Country info (only if UTF-16BE requested)
87 : */
88 : static pdf_status_t
89 : pdf_text_get_unicode_string_header(pdf_char_t header[PDF_TEXT_USHMAXL],
90 : pdf_size_t *header_length,
91 : const enum pdf_text_unicode_encoding_e enc,
92 : const pdf_u32_t options,
93 : const pdf_char_t *language,
94 : const pdf_char_t *country);
95 :
96 : /* Function to convert a given Unicode Host Endian enumeration to the `real'
97 : * endianness (BE or LE). If a non-HE enumeration is passed to the function,
98 : * it will return the same enumeration value unchanged */
99 : static enum pdf_text_unicode_encoding_e
100 : pdf_text_transform_he_to_unicode_encoding(enum pdf_text_unicode_encoding_e enc);
101 :
102 : /* Function to compare two given words */
103 : static pdf_i32_t
104 : pdf_text_compare_words(const pdf_char_t *word1,
105 : const pdf_size_t size1,
106 : const pdf_char_t *word2,
107 : const pdf_size_t size2,
108 : const pdf_char_t *language1,
109 : const pdf_char_t *language2,
110 : pdf_status_t *p_ret_code);
111 :
112 : /* Non-Case sensitive comparison of text objects */
113 : static pdf_i32_t
114 : pdf_text_cmp_non_case_sensitive(pdf_text_t text1,
115 : pdf_text_t text2,
116 : pdf_status_t *p_ret_code);
117 :
118 : /* Clean (destroy and create empty) Word Boundaries list */
119 : static pdf_status_t
120 : pdf_text_clean_word_boundaries_list(pdf_list_t *p_word_boundaries);
121 : /* Fill in the Word Boundaries list using the given data */
122 : static pdf_status_t
123 : pdf_text_fill_word_boundaries_list(pdf_list_t word_boundaries,
124 : const pdf_char_t *data,
125 : const pdf_size_t size);
126 :
127 :
128 : /* ----------------------------- Public functions ----------------------------*/
129 :
130 :
131 :
132 : pdf_status_t
133 : pdf_text_init(void)
134 735 : {
135 : /* Initiate Text module context */
136 735 : return pdf_text_context_init();
137 : }
138 :
139 :
140 : pdf_status_t
141 : pdf_text_new (pdf_text_t *text)
142 941 : {
143 : /* The text global state should be initialized! */
144 941 : if (pdf_text_context_initialized () == PDF_FALSE)
145 : {
146 1 : return PDF_EBADCONTEXT;
147 : }
148 :
149 : /* Allocate memory for the new text structure */
150 940 : *text = (pdf_text_t) pdf_alloc (sizeof(struct pdf_text_s));
151 940 : if (*text == NULL)
152 : {
153 : /* Out of memory condition */
154 0 : return PDF_ENOMEM;
155 : }
156 :
157 : /* Initialize all contents */
158 940 : (*text)->data = NULL;
159 940 : (*text)->size = 0;
160 940 : (*text)->printable = NULL;
161 940 : (*text)->modified = PDF_FALSE;
162 940 : memset(&((*text)->lang[0]), 0, PDF_TEXT_CCL);
163 940 : memset(&((*text)->country[0]), 0, PDF_TEXT_CCL);
164 :
165 : /* Create empty word boundaries list */
166 940 : if(pdf_text_create_word_boundaries_list(&((*text)->word_boundaries)) != \
167 : PDF_OK)
168 : {
169 0 : pdf_dealloc(*text);
170 0 : *text = NULL;
171 : }
172 :
173 : /* Success! */
174 940 : return PDF_OK;
175 : }
176 :
177 :
178 : pdf_status_t
179 : pdf_text_destroy (pdf_text_t text)
180 764 : {
181 : /* Dealloc memory */
182 764 : if(text->data != NULL)
183 : {
184 664 : pdf_dealloc(text->data);
185 664 : text->data = NULL;
186 : }
187 :
188 764 : if (text->printable != NULL)
189 : {
190 0 : pdf_dealloc (text->printable);
191 : }
192 :
193 : /* Destroy word boundaries list */
194 764 : pdf_text_destroy_word_boundaries_list(&text->word_boundaries);
195 :
196 : /* Finally, clear full structure */
197 764 : pdf_dealloc(text);
198 :
199 764 : return PDF_OK;
200 : }
201 :
202 :
203 : pdf_text_t
204 : pdf_text_dup (const pdf_text_t text)
205 31 : {
206 : pdf_text_t element;
207 :
208 31 : if (text == NULL)
209 : {
210 1 : return NULL;
211 : }
212 :
213 : /* Allocate and initialize element */
214 30 : if (pdf_text_new (&element) == PDF_OK)
215 : {
216 : /* Duplicate size */
217 30 : element->size = text->size;
218 :
219 : /* Duplicate contents (if size > 0) */
220 30 : if(element->size > 0)
221 : {
222 29 : element->data = (pdf_char_t *) pdf_alloc (element->size);
223 29 : if(element->data != NULL)
224 : {
225 29 : memcpy(element->data, text->data, (size_t)element->size);
226 : }
227 : }
228 :
229 : /* Duplicate Language code and Country code (if available) */
230 30 : memcpy(element->lang, text->lang, (size_t) PDF_TEXT_CCL);
231 30 : memcpy(element->country, text->country, (size_t) PDF_TEXT_CCL);
232 :
233 : /* We don't really need to duplicate the contents of the word
234 : * boundaries list, as it is a side product, same with printable */
235 :
236 : /* Set output element...*/
237 30 : return element;
238 : }
239 : else
240 : {
241 : /* Dup failed */
242 0 : return NULL;
243 : }
244 : }
245 :
246 :
247 :
248 : pdf_status_t
249 : pdf_text_new_from_host (const pdf_char_t *str,
250 : const pdf_size_t size,
251 : const pdf_text_host_encoding_t enc,
252 : pdf_text_t *text)
253 7 : {
254 7 : pdf_text_t element = NULL;
255 7 : pdf_status_t ret_code = PDF_ETEXTENC;
256 : pdf_status_t ret_code_new;
257 :
258 7 : if((str == NULL) || \
259 : (size == 0))
260 : {
261 2 : return PDF_EBADDATA;
262 : }
263 :
264 : /* Allocate and initialize element */
265 5 : ret_code_new = pdf_text_new (&element);
266 5 : if (ret_code_new != PDF_OK)
267 : {
268 : /* Oops, element creation failed due to an error... */
269 0 : return ret_code_new;
270 : }
271 :
272 : /* Set Host Encoding contents */
273 5 : ret_code = pdf_text_set_host(element, str, size, enc);
274 :
275 5 : if(ret_code == PDF_OK)
276 : {
277 : /* Perfect! Set output variable */
278 3 : *text = element;
279 : }
280 : else
281 : {
282 : /* Conversion went wrong... so destroy object contents */
283 2 : pdf_text_destroy(element);
284 : }
285 :
286 : /* Return status of the conversion */
287 5 : return ret_code;
288 : }
289 :
290 :
291 :
292 : pdf_status_t
293 : pdf_text_new_from_pdf_string (const pdf_char_t *str,
294 : const pdf_size_t size,
295 : pdf_char_t **remaining_str,
296 : pdf_size_t *remaining_length,
297 : pdf_text_t *text)
298 97 : {
299 97 : pdf_status_t ret_code = PDF_ETEXTENC;
300 : pdf_status_t ret_code_new;
301 97 : pdf_text_t element = NULL;
302 97 : short bom_found = 0;
303 97 : short lang_found = 0;
304 :
305 97 : if(str == NULL)
306 : {
307 0 : return PDF_EBADDATA;
308 : }
309 :
310 : /* Allocate and initialize element */
311 97 : ret_code_new = pdf_text_new (&element);
312 97 : if (ret_code_new != PDF_OK)
313 : {
314 : /* Oops, element creation failed due to some error... */
315 0 : return ret_code_new;
316 : }
317 :
318 : /* First of all, check first two bytes to detect UTF-16BE BOM or lang/country
319 : * code initializer.
320 : * If length of the text is less than 2, then we can assume it is encoded in
321 : * PDF Doc Encoding */
322 97 : if(size >= 2)
323 : {
324 : /* Check Unicode Byte Order Marker encoded in UTF-16BE */
325 92 : if(pdf_text_check_unicode_bom(str, size, PDF_TEXT_UTF16_BE, 0))
326 : {
327 21 : bom_found = 1;
328 : /* Check Lang/Country Code initializer */
329 21 : if((size >= 4) && \
330 : (str[3] == PDF_TEXT_LCI_1) && \
331 : (str[2] == PDF_TEXT_LCI_0))
332 : {
333 16 : lang_found = 1;
334 : }
335 : }
336 : /* Check Lang/Country Code initializer (if this is the nth call to the
337 : * function parsing a single UTF-16BE string.*/
338 71 : else if((str[1] == PDF_TEXT_LCI_1) && \
339 : (str[0] == PDF_TEXT_LCI_0))
340 : {
341 12 : lang_found = 1;
342 : }
343 : }
344 :
345 : /* If either BOM or Lang Marker are found, process PDF string as encoded
346 : * in UTF16-BE */
347 97 : if(bom_found || lang_found)
348 : {
349 33 : pdf_char_t *string_start = (pdf_char_t *)str;
350 33 : pdf_size_t string_length = size;
351 :
352 : /* Skip 2-bytes BOM */
353 33 : if(bom_found)
354 : {
355 21 : string_start += 2;
356 21 : string_length -= 2;
357 : }
358 :
359 : /* If lang/country code available, obtain and store the information */
360 33 : if((lang_found) && \
361 : (pdf_text_get_lang_from_utf16be(element,
362 : &string_start, &string_length,
363 : string_start, string_length)!=PDF_OK))
364 : {
365 : PDF_DEBUG_BASE("Invalid Lang/Code info detected");
366 0 : pdf_text_destroy(element);
367 0 : return PDF_ETEXTENC;
368 : }
369 :
370 : /* And finally convert to UTF-32... */
371 33 : ret_code = pdf_text_utf16be_to_utf32he(string_start,
372 : string_length,
373 : &(element->data),
374 : &(element->size),
375 : remaining_str,
376 : remaining_length);
377 : }
378 : /* Else, process PDF string as encoded in PDF Doc Encoding */
379 : else
380 : {
381 : /* We already know that this string will be fully stored, without
382 : * splitting in chunks */
383 64 : if(remaining_length != NULL)
384 : {
385 24 : *remaining_length = 0;
386 : }
387 64 : if(remaining_str != NULL)
388 : {
389 24 : *remaining_str = NULL;
390 : }
391 : /* And perform the conversion */
392 64 : ret_code = pdf_text_pdfdocenc_to_utf32he(str,
393 : size,
394 : &(element->data),
395 : &(element->size));
396 : }
397 :
398 : /* Only store in the output element if and only if everything went ok */
399 97 : if(ret_code == PDF_OK)
400 : {
401 86 : *text = element;
402 : }
403 : else
404 : {
405 11 : pdf_text_destroy(element);
406 : }
407 97 : return ret_code;
408 : }
409 :
410 :
411 : pdf_status_t
412 : pdf_text_new_from_unicode (const pdf_char_t *str,
413 : const pdf_size_t size,
414 : const enum pdf_text_unicode_encoding_e enc,
415 : pdf_text_t *text)
416 539 : {
417 539 : pdf_text_t element = NULL;
418 539 : pdf_status_t ret_code = PDF_OK;
419 : pdf_status_t ret_code_new;
420 :
421 539 : if(str == NULL)
422 : {
423 0 : return PDF_EBADDATA;
424 : }
425 :
426 : /* Allocate and initialize element */
427 539 : ret_code_new = pdf_text_new (&element);
428 539 : if (ret_code_new != PDF_OK)
429 : {
430 : /* Oops, element creation failed due to some error... */
431 0 : return ret_code_new;
432 : }
433 :
434 : /* Set Unicode contents */
435 539 : if(size > 0)
436 : {
437 530 : ret_code = pdf_text_set_unicode(element, str, size, enc);
438 : }
439 :
440 539 : if(ret_code == PDF_OK)
441 : {
442 : /* Perfect! Set output variable */
443 499 : *text = element;
444 : }
445 : else
446 : {
447 : /* Conversion went wrong... so destroy object contents */
448 40 : pdf_text_destroy(element);
449 : }
450 :
451 : /* Return status of the conversion */
452 539 : return ret_code;
453 : }
454 :
455 :
456 : pdf_status_t
457 : pdf_text_new_from_u32 (const pdf_u32_t number,
458 : pdf_text_t *text)
459 2 : {
460 : /* Longest number to hold in 32bit: 2^32 = 4294967296 (10 chars) */
461 : pdf_char_t temp[10 + 1];
462 : pdf_size_t n;
463 :
464 : /* Print number in temporal char array, and get number of output chars */
465 2 : n = sprintf((char *)&temp[0],"%u",(unsigned int)number);
466 :
467 : /* At least one char should have been printed! */
468 2 : if(n > 0)
469 : {
470 : /* Treat the generated string as UTF-8 encoded (just numbers in ASCII) */
471 2 : return pdf_text_new_from_unicode (&temp[0], n, PDF_TEXT_UTF8, text);
472 : }
473 : else
474 : {
475 : PDF_DEBUG_BASE("Invalid u32 received: %u", (unsigned int)number);
476 0 : return PDF_EBADTEXT;
477 : }
478 : }
479 :
480 :
481 : /* Return the country associated with a text variable */
482 : const pdf_char_t *
483 : pdf_text_get_country (const pdf_text_t text)
484 154 : {
485 262 : return (const pdf_char_t *)text->country;
486 : }
487 :
488 : /* Return the language associated with a text variable */
489 : const pdf_char_t *
490 : pdf_text_get_language (const pdf_text_t text)
491 222 : {
492 348 : return (const pdf_char_t *)text->lang;
493 : }
494 :
495 : /* Associate a text variable (full text) with a country code */
496 : pdf_status_t
497 : pdf_text_set_country (pdf_text_t text,
498 : const pdf_char_t *code)
499 154 : {
500 154 : if((code == NULL) || \
501 : (strlen((char *)code) != (PDF_TEXT_CCL-1)))
502 : {
503 2 : return PDF_EBADDATA;
504 : }
505 :
506 152 : memcpy(&(text->country[0]), code, PDF_TEXT_CCL-1);
507 : /* Make sure that last byte is NUL */
508 152 : text->country[PDF_TEXT_CCL-1] = '\0';
509 152 : return PDF_OK;
510 : }
511 :
512 :
513 : /* Associate a text variable (full text) with a language code */
514 : pdf_status_t
515 : pdf_text_set_language (pdf_text_t text,
516 : const pdf_char_t *code)
517 301 : {
518 301 : if((code == NULL) || \
519 : (strlen((char *)code) != (PDF_TEXT_CCL-1)))
520 : {
521 2 : return PDF_EBADDATA;
522 : }
523 :
524 299 : memcpy(&(text->lang[0]), code, PDF_TEXT_CCL-1);
525 : /* Make sure that last byte is NUL */
526 299 : text->lang[PDF_TEXT_CCL-1] = '\0';
527 299 : return PDF_OK;
528 : }
529 :
530 :
531 : /* Determine if a given text variable is empty (contains no text) */
532 : inline pdf_bool_t
533 : pdf_text_empty_p (const pdf_text_t text)
534 5 : {
535 45 : return ((text->size != 0) ? PDF_FALSE : PDF_TRUE);
536 : }
537 :
538 :
539 : /* Get default system host encoding */
540 : pdf_text_host_encoding_t
541 : pdf_text_get_host_encoding(void)
542 28 : {
543 28 : return pdf_text_context_get_host_encoding();
544 : }
545 :
546 :
547 : /* Check if host encoding is available */
548 : pdf_status_t
549 : pdf_text_check_host_encoding(const pdf_char_t *encoding_name,
550 : pdf_text_host_encoding_t *p_encoding)
551 4 : {
552 : /* Check length of host encoding */
553 4 : if(strlen((char *)encoding_name) >= PDF_TEXT_HENMAXL)
554 : {
555 : PDF_DEBUG_BASE("Encoding name too long!");
556 0 : return PDF_EBADDATA;
557 : }
558 :
559 4 : if(pdf_text_host_encoding_is_available(encoding_name) == PDF_OK)
560 : {
561 3 : strcpy((char *)(&(p_encoding->name[0])), (char *)encoding_name);
562 3 : p_encoding->name[strlen((char *)encoding_name)-1] = '\0';
563 3 : return PDF_OK;
564 : }
565 : else
566 : {
567 1 : return PDF_ETEXTENC;
568 : }
569 : }
570 :
571 :
572 : pdf_text_host_encoding_t
573 : pdf_text_get_best_encoding (pdf_text_t text,
574 : const pdf_text_host_encoding_t preferred_encoding)
575 1 : {
576 : pdf_text_host_encoding_t ret_encoding;
577 : #ifdef PDF_HOST_WIN32
578 : static const pdf_char_t *to_check [3] = {
579 : (pdf_char_t *) "CP65001", /* UTF-8 */
580 : (pdf_char_t *) "CP1200", /* UTF-16LE */
581 : (pdf_char_t *) "CP12000" /* UTF-32LE */
582 : };
583 : #else
584 : static const pdf_char_t *to_check [3] = {
585 : (pdf_char_t *) "UTF-8",
586 : (pdf_char_t *) "UTF-16",
587 : (pdf_char_t *) "UTF-32"
588 : };
589 :
590 : #endif
591 1 : int i = 0;
592 : /* Check for Unicode support as host encoding */
593 1 : for(i = 0; i<3; i++)
594 : {
595 1 : if(pdf_text_check_host_encoding(to_check[i], &ret_encoding) == PDF_OK)
596 : {
597 1 : return ret_encoding;
598 : }
599 : }
600 : /* If host does not support any Unicode encoding conversion, return the
601 : * preferred one directly */
602 0 : return preferred_encoding;
603 : }
604 :
605 :
606 :
607 : pdf_status_t
608 : pdf_text_get_host (pdf_char_t **contents,
609 : pdf_size_t *length,
610 : const pdf_text_t text,
611 : const pdf_text_host_encoding_t enc)
612 33 : {
613 :
614 33 : return pdf_text_utf32he_to_host (text->data, text->size, enc,
615 : contents, length);
616 : }
617 :
618 :
619 : /* Get the contents of a text variable encoded in PDFDocEncoding, as a NUL
620 : * terminated string */
621 : pdf_status_t
622 : pdf_text_get_pdfdocenc (pdf_char_t **contents,
623 : const pdf_text_t text)
624 119 : {
625 : pdf_status_t ret_code;
626 119 : pdf_char_t *data = NULL;
627 119 : pdf_size_t size = -1;
628 :
629 119 : ret_code = pdf_text_utf32he_to_pdfdocenc(text->data, text->size,
630 : &data, &size);
631 :
632 : /* Now, if conversion went ok... */
633 119 : if(ret_code == PDF_OK)
634 : {
635 : /* Add NUL character at the end of the array */
636 119 : data = pdf_realloc(data, size+1);
637 119 : if(data != NULL)
638 : {
639 119 : data[size] = '\0';
640 : /* Set output data... */
641 119 : *contents = data;
642 : }
643 : else
644 : {
645 0 : return PDF_ENOMEM;
646 : }
647 : }
648 : /* else, clear allocated memory, if any */
649 0 : else if(data != NULL)
650 : {
651 0 : pdf_dealloc(data);
652 : }
653 :
654 119 : return ret_code;
655 : }
656 :
657 :
658 : pdf_status_t
659 : pdf_text_get_unicode (pdf_char_t **contents,
660 : pdf_size_t *length,
661 : const pdf_text_t text,
662 : const enum pdf_text_unicode_encoding_e enc,
663 : const pdf_u32_t options)
664 592 : {
665 : pdf_status_t ret_code;
666 : enum pdf_text_unicode_encoding_e new_enc;
667 592 : pdf_char_t *out_data = NULL;
668 592 : pdf_size_t out_length = 0;
669 :
670 : /* Check for invalid options... */
671 592 : if((options & PDF_TEXT_UTF16BE_WITH_LANGCODE) && \
672 : (enc != PDF_TEXT_UTF16_BE))
673 : {
674 : PDF_DEBUG_BASE("Lang/Country info only available for UTF-16BE");
675 : /* Not allowed!!! */
676 164 : return PDF_EBADDATA;
677 : }
678 :
679 : /* If host endianness required, check it and convert input encoding */
680 428 : new_enc = pdf_text_transform_he_to_unicode_encoding(enc);
681 :
682 : /* If text is empty, set empty string */
683 460 : if((text->data == NULL) || \
684 : (text->size == 0))
685 : {
686 32 : out_data = NULL;
687 32 : out_length = 0;
688 32 : ret_code = PDF_OK;
689 : }
690 : else
691 : {
692 : /* Perform conversion */
693 396 : switch(new_enc)
694 : {
695 : case PDF_TEXT_UTF8: /* UTF-8 */
696 23 : ret_code = pdf_text_utf32he_to_utf8(text->data, text->size,
697 : &out_data, &out_length);
698 23 : break;
699 : case PDF_TEXT_UTF16_LE: /* UTF-16LE */
700 20 : ret_code = pdf_text_utf32he_to_utf16le(text->data, text->size,
701 : &out_data, &out_length);
702 20 : break;
703 : case PDF_TEXT_UTF16_BE: /* UTF-16BE */
704 60 : ret_code = pdf_text_utf32he_to_utf16be(text->data, text->size,
705 : &out_data, &out_length);
706 60 : break;
707 : case PDF_TEXT_UTF32_LE: /* UTF-32LE */
708 217 : ret_code = pdf_text_utf32he_to_utf32le(text->data, text->size,
709 : &out_data, &out_length);
710 217 : break;
711 : case PDF_TEXT_UTF32_BE: /* UTF-32BE */
712 76 : ret_code = pdf_text_utf32he_to_utf32be(text->data, text->size,
713 : &out_data, &out_length);
714 76 : break;
715 : default:
716 0 : ret_code = PDF_ETEXTENC;
717 : }
718 : }
719 :
720 : /* Check if specific options were requested */
721 428 : if(options != PDF_TEXT_UNICODE_NO_OPTION)
722 : {
723 : pdf_char_t header[PDF_TEXT_USHMAXL];
724 138 : pdf_size_t header_size = 0;
725 138 : pdf_size_t trailer_size = 0;
726 :
727 : /* Compute header if needed */
728 138 : if((options & PDF_TEXT_UNICODE_WITH_BOM) || \
729 : (options & PDF_TEXT_UTF16BE_WITH_LANGCODE))
730 : {
731 : /* Clear header array */
732 108 : memset(&(header[0]), 0, PDF_TEXT_USHMAXL);
733 : /* Get requested header (BOM and/or lang/country info) */
734 108 : pdf_text_get_unicode_string_header(header,
735 : &header_size,
736 : new_enc,
737 : options,
738 : pdf_text_get_language(text),
739 : pdf_text_get_country(text));
740 : }
741 : /* Compute trailer if needed */
742 138 : if(options & PDF_TEXT_UNICODE_WITH_NUL_SUFFIX)
743 : {
744 84 : switch(new_enc)
745 : {
746 : case PDF_TEXT_UTF8:
747 12 : trailer_size = 1;
748 12 : break;
749 : case PDF_TEXT_UTF16_BE:
750 : case PDF_TEXT_UTF16_LE:
751 : case PDF_TEXT_UTF16_HE:
752 48 : trailer_size = 2;
753 48 : break;
754 : case PDF_TEXT_UTF32_BE:
755 : case PDF_TEXT_UTF32_LE:
756 : case PDF_TEXT_UTF32_HE:
757 24 : trailer_size = 4;
758 24 : break;
759 : default:
760 0 : trailer_size = 0;
761 : break;
762 : }
763 : }
764 :
765 138 : if((header_size > 0) || \
766 : (trailer_size > 0))
767 : {
768 138 : pdf_char_t *new_out_data = NULL;
769 :
770 : /* Allocate memory for new string */
771 138 : new_out_data = (pdf_char_t *)pdf_alloc(out_length + \
772 : header_size + \
773 : trailer_size);
774 138 : if(new_out_data == NULL)
775 : {
776 0 : return PDF_ENOMEM;
777 : }
778 : /* Store header */
779 138 : memcpy(new_out_data, &header[0], header_size);
780 :
781 138 : if((out_data != NULL) && \
782 : (out_length != 0))
783 : {
784 : /* Store unicode data, if any */
785 115 : memcpy(&new_out_data[header_size], out_data, out_length);
786 : /* Reset output data array, if any */
787 115 : pdf_dealloc(out_data);
788 : }
789 :
790 : /* Store trailer (N-byte NUL) */
791 138 : if(trailer_size > 0)
792 : {
793 84 : memset(&new_out_data[out_length+header_size],0,trailer_size);
794 : }
795 :
796 138 : out_data = new_out_data;
797 138 : out_length += (header_size + trailer_size);
798 : }
799 : else
800 : {
801 : PDF_DEBUG_BASE("Invalid unicode option requested (%u)",
802 : (unsigned int)options);
803 : }
804 : }
805 :
806 : /* Only store in the output element if and only if everything went ok */
807 428 : if(ret_code == PDF_OK)
808 : {
809 428 : *contents = out_data;
810 428 : *length = out_length;
811 : }
812 0 : else if(out_data != NULL)
813 : {
814 0 : pdf_dealloc(out_data);
815 : }
816 428 : return ret_code;
817 : }
818 :
819 :
820 : pdf_char_t *
821 : pdf_text_get_hex (const pdf_text_t text,
822 : const pdf_char_t delimiter)
823 2 : {
824 : int i;
825 : int j;
826 : unsigned int new_str_length;
827 : pdf_char_t *new_str;
828 : char new_hex_char [3];
829 :
830 2 : if(text->size > 0)
831 : {
832 : /* Get new string length. If input string has N bytes, we need:
833 : * - 1 byte for last NUL char
834 : * - 2N bytes for hexadecimal char representation of each byte...
835 : * - N-1 bytes for the separator ':'
836 : * So... a total of (1+2N+N-1) = 3N bytes are needed... */
837 1 : new_str_length = 3 * text->size;
838 :
839 : /* Allocate memory for new array and initialize contents to NUL */
840 1 : new_str = (pdf_char_t *)pdf_alloc(new_str_length);
841 1 : if(new_str != NULL)
842 : {
843 1 : memset(new_str, 0, new_str_length);
844 :
845 : /* Print hexadecimal representation of each byte... */
846 9 : for(i=0, j=0; i<text->size; i++, j+=3)
847 : {
848 : /* Clear helper array... */
849 8 : memset(&new_hex_char[0], 0, 3);
850 : /* Print character in helper array... */
851 8 : sprintf( new_hex_char, "%02X", (unsigned int)text->data[i]);
852 : /* Copy to output string... */
853 8 : memcpy(&new_str[j],&new_hex_char[0],2);
854 : /* And if needed, add separator */
855 8 : if(i != (text->size-1) )
856 : {
857 7 : new_str[j+2] = delimiter;
858 : }
859 : }
860 : }
861 : }
862 : else
863 : {
864 1 : new_str = (pdf_char_t *)pdf_alloc(1);
865 1 : if(new_str != NULL)
866 : {
867 1 : new_str[0] = '\0';
868 : }
869 : }
870 : /* Set output string */
871 2 : return new_str;
872 : }
873 :
874 :
875 : pdf_status_t
876 : pdf_text_set_host (pdf_text_t text,
877 : const pdf_char_t *str,
878 : const pdf_size_t size,
879 : const pdf_text_host_encoding_t enc)
880 10 : {
881 : pdf_status_t ret_code;
882 : pdf_char_t *temp_data;
883 : pdf_size_t temp_size;
884 :
885 10 : if(str == NULL)
886 : {
887 0 : return PDF_EBADDATA;
888 : }
889 :
890 10 : ret_code = pdf_text_host_to_utf32he (str, size, enc,
891 : &temp_data, &temp_size);
892 10 : if(ret_code == PDF_OK)
893 : {
894 : /* Destroy previous contents of text variable, if any */
895 6 : pdf_text_clean_contents(text);
896 :
897 : /* Really set contents */
898 6 : text->data = temp_data;
899 6 : text->size = temp_size;
900 : }
901 10 : return ret_code;
902 : }
903 :
904 :
905 : /* Set PDF Doc Endoded string */
906 : pdf_status_t
907 : pdf_text_set_pdfdocenc (pdf_text_t text,
908 : const pdf_char_t *str)
909 113 : {
910 : pdf_status_t ret_code;
911 : pdf_char_t *temp_data;
912 : pdf_size_t temp_size;
913 :
914 113 : if(str == NULL)
915 : {
916 0 : return PDF_EBADDATA;
917 : }
918 :
919 113 : ret_code = pdf_text_pdfdocenc_to_utf32he (str, strlen((char *)str),
920 : &temp_data, &temp_size);
921 113 : if(ret_code == PDF_OK)
922 : {
923 : /* Destroy previous contents of text variable, if any */
924 112 : pdf_text_clean_contents(text);
925 :
926 : /* Really set contents */
927 112 : text->data = temp_data;
928 112 : text->size = temp_size;
929 : }
930 113 : return ret_code;
931 : }
932 :
933 :
934 : pdf_status_t
935 : pdf_text_set_unicode (pdf_text_t text,
936 : const pdf_char_t *str,
937 : const pdf_size_t size,
938 : const enum pdf_text_unicode_encoding_e enc)
939 693 : {
940 693 : pdf_status_t ret_code = PDF_ETEXTENC;
941 : pdf_char_t *temp_data;
942 : pdf_size_t temp_size;
943 : enum pdf_text_unicode_encoding_e new_enc;
944 :
945 693 : if((str == NULL) || \
946 : (size == 0))
947 : {
948 0 : return PDF_EBADDATA;
949 : }
950 :
951 : /* If host endianness required, check it and convert input encoding */
952 693 : new_enc = pdf_text_transform_he_to_unicode_encoding(enc);
953 :
954 693 : switch(new_enc)
955 : {
956 : case PDF_TEXT_UTF8: /* UTF-8 */
957 127 : ret_code = pdf_text_utf8_to_utf32he(str, size,
958 : &temp_data, &temp_size);
959 127 : break;
960 : case PDF_TEXT_UTF16_LE: /* UTF-16LE */
961 70 : ret_code = pdf_text_utf16le_to_utf32he(str, size,
962 : &temp_data, &temp_size);
963 70 : break;
964 : case PDF_TEXT_UTF16_BE: /* UTF-16BE */
965 45 : ret_code = pdf_text_utf16be_to_utf32he(str, size,
966 : &temp_data, &temp_size,
967 : NULL, NULL);
968 45 : break;
969 : case PDF_TEXT_UTF32_LE: /* UTF-32LE */
970 70 : ret_code = pdf_text_utf32le_to_utf32he(str, size,
971 : &temp_data, &temp_size);
972 70 : break;
973 : case PDF_TEXT_UTF32_BE: /* UTF-32BE */
974 381 : ret_code = pdf_text_utf32be_to_utf32he(str, size,
975 : &temp_data, &temp_size);
976 381 : break;
977 : default:
978 0 : ret_code = PDF_EBADDATA;
979 : }
980 :
981 693 : if(ret_code == PDF_OK)
982 : {
983 : /* Destroy previous contents of text variable, if any */
984 573 : pdf_text_clean_contents(text);
985 :
986 : /* Really set contents */
987 573 : text->data = temp_data;
988 573 : text->size = temp_size;
989 : }
990 693 : return ret_code;
991 : }
992 :
993 :
994 : /* Concatenate the two text variables, only if country/lang info is equal */
995 : pdf_status_t
996 : pdf_text_concat (pdf_text_t text1,
997 : const pdf_text_t text2,
998 : const pdf_bool_t override_langinfo)
999 16 : {
1000 16 : if(!override_langinfo)
1001 : {
1002 : /* An error will be returned if lang code is different */
1003 7 : if(strcmp((char *)text1->lang, (char *)text2->lang) != 0)
1004 : {
1005 3 : return PDF_ETEXTENC;
1006 : }
1007 :
1008 : /* An error will be returned if country code is different */
1009 4 : if(strcmp((char *)text1->country, (char *)text2->country) != 0)
1010 : {
1011 0 : return PDF_ETEXTENC;
1012 : }
1013 : }
1014 :
1015 : /* Ok, so language/country info is equal or non-existent, start
1016 : * concatenation */
1017 13 : if(text2->size > 0)
1018 : {
1019 : pdf_char_t * tmp;
1020 : /* Re-allocate memory in first text element */
1021 9 : tmp = (pdf_char_t *)pdf_realloc (text1->data,
1022 : text1->size + text2->size);
1023 :
1024 9 : if (tmp == NULL)
1025 : {
1026 0 : return PDF_ENOMEM;
1027 : }
1028 :
1029 9 : text1->data = tmp;
1030 :
1031 : /* Copy contents of second element after the first one */
1032 9 : memcpy(&(text1->data[text1->size]), text2->data, text2->size);
1033 :
1034 : /* Update size of first element */
1035 9 : text1->size += text2->size;
1036 :
1037 9 : text1->modified = PDF_TRUE;
1038 : }
1039 :
1040 13 : return PDF_OK;
1041 : }
1042 :
1043 :
1044 : /* Concatenate a text variable with an ascii string */
1045 : pdf_status_t
1046 : pdf_text_concat_ascii (pdf_text_t text1,
1047 : const pdf_char_t * ascii_str)
1048 4 : {
1049 : pdf_size_t len;
1050 :
1051 4 : len = (pdf_size_t) strlen ((char*)ascii_str);
1052 4 : if (!pdf_text_is_ascii7 (ascii_str, len))
1053 : {
1054 0 : return PDF_EBADDATA;
1055 : }
1056 :
1057 : /* now convert to utf32he and concatenate */
1058 4 : if(len > 0)
1059 : {
1060 : pdf_char_t * newbuf;
1061 : pdf_status_t ret;
1062 : pdf_char_t *tmp_data;
1063 : pdf_size_t tmp_size;
1064 :
1065 : /* ascii string is valid utf8 */
1066 2 : ret = pdf_text_utf8_to_utf32he (ascii_str, len, &tmp_data, &tmp_size);
1067 2 : if (ret != PDF_OK)
1068 : {
1069 0 : return ret;
1070 : }
1071 :
1072 2 : newbuf = (pdf_char_t *)pdf_realloc (text1->data, text1->size + tmp_size);
1073 2 : if (newbuf == NULL)
1074 : {
1075 0 : return PDF_ENOMEM;
1076 : }
1077 : else
1078 : {
1079 2 : text1->data = newbuf;
1080 : }
1081 :
1082 2 : memcpy (&(text1->data[text1->size]), tmp_data, tmp_size);
1083 2 : text1->size += tmp_size;
1084 2 : pdf_dealloc (tmp_data);
1085 : }
1086 :
1087 4 : return PDF_OK;
1088 : }
1089 :
1090 :
1091 : /* Default initial size of the list of replacements */
1092 : #define PDF_TEXT_ISLR 32
1093 :
1094 : /* Replace a given pattern in a text object */
1095 :
1096 : pdf_status_t
1097 : pdf_text_replace (pdf_text_t text,
1098 : const pdf_text_t new_pattern,
1099 : const pdf_text_t old_pattern)
1100 24 : {
1101 24 : return pdf_text_replace_multiple(text, new_pattern, &old_pattern, 1);
1102 : }
1103 :
1104 : typedef struct pdf_text_repl_s {
1105 : pdf_char_t *data_ptr;
1106 : int old_pattern_i;
1107 : } pdf_text_repl_t;
1108 :
1109 :
1110 : /* Check replacement patterns and get minimum size */
1111 : static pdf_status_t
1112 : pdf_text_check_replacement_patterns(const pdf_text_t *p_old_patterns,
1113 : const int n_old_patterns,
1114 : pdf_size_t *p_min_old_pattern_size)
1115 : {
1116 28 : pdf_size_t minimum_old_pattern_size = -1;
1117 : int i_pattern;
1118 :
1119 66 : for(i_pattern = 0; i_pattern < n_old_patterns; ++i_pattern)
1120 : {
1121 : /* Get minimum old pattern size */
1122 40 : if((i_pattern == 0) || \
1123 : ((p_old_patterns[i_pattern])->size < minimum_old_pattern_size))
1124 : {
1125 32 : minimum_old_pattern_size = (p_old_patterns[i_pattern])->size;
1126 : }
1127 : /* Empty old pattern is not allowed */
1128 80 : if(pdf_text_empty_p(p_old_patterns[i_pattern]))
1129 : {
1130 2 : return PDF_ETEXTENC;
1131 : }
1132 : }
1133 :
1134 : /* Set output var and exit correctly */
1135 26 : *p_min_old_pattern_size = minimum_old_pattern_size;
1136 26 : return PDF_OK;
1137 : }
1138 :
1139 : pdf_status_t
1140 : pdf_text_get_replacement_pointers(pdf_text_repl_t **p_rep_ptrs, \
1141 : long *p_n_replacements, \
1142 : pdf_size_t *p_new_size, \
1143 : const pdf_text_t text, \
1144 : const pdf_size_t minimum_old_pattern_size, \
1145 : const pdf_text_t new_pattern, \
1146 : const pdf_text_t *p_old_patterns, \
1147 : const int n_old_patterns)
1148 20 : {
1149 : pdf_size_t new_size;
1150 : int i_pattern;
1151 : long i;
1152 : long n_replacements;
1153 20 : pdf_text_repl_t *rep_ptrs = NULL;
1154 20 : long rep_ptrs_size = PDF_TEXT_ISLR/2;
1155 :
1156 20 : n_replacements = 0;
1157 20 : i = 0;
1158 20 : new_size = 0;
1159 596 : while(i <= (text->size - minimum_old_pattern_size))
1160 : {
1161 : /* If old pattern found... */
1162 556 : int old_pattern_found = 0;
1163 556 : i_pattern = 0;
1164 2040 : while((!old_pattern_found) && \
1165 : (i_pattern < n_old_patterns))
1166 : {
1167 962 : if(((text->size - i) >= ((p_old_patterns[i_pattern])->size)) && \
1168 : (memcmp(&(text->data[i]), \
1169 : (p_old_patterns[i_pattern])->data,
1170 : (p_old_patterns[i_pattern])->size)==0))
1171 : {
1172 34 : old_pattern_found = 1;
1173 : /* Duplicate size of replacement pointers list, if needed */
1174 34 : if((rep_ptrs == NULL) || \
1175 : (rep_ptrs_size == n_replacements))
1176 : {
1177 12 : rep_ptrs = (pdf_text_repl_t *)pdf_realloc(rep_ptrs,
1178 : 2 * rep_ptrs_size * \
1179 : sizeof(pdf_text_repl_t));
1180 12 : if(rep_ptrs == NULL)
1181 : {
1182 0 : return PDF_ENOMEM;
1183 : }
1184 : }
1185 : /* Store pointer to old pattern */
1186 34 : rep_ptrs[n_replacements].data_ptr = &(text->data[i]);
1187 34 : rep_ptrs[n_replacements].old_pattern_i = i_pattern;
1188 34 : n_replacements++;
1189 : /* The index must be updated to skip the replacement */
1190 34 : i += (p_old_patterns[i_pattern])->size;
1191 : /* Update new size */
1192 34 : new_size += new_pattern->size;
1193 : }
1194 : else
1195 : {
1196 894 : i_pattern++;
1197 : }
1198 : }
1199 556 : if(!old_pattern_found)
1200 : {
1201 522 : i+=4;
1202 522 : new_size +=4;
1203 : }
1204 : }
1205 :
1206 : /* Udpate new size with remaining data in old array */
1207 20 : new_size += (text->size - i);
1208 :
1209 : /* Set output data and exit correctly */
1210 20 : *p_new_size = new_size;
1211 20 : *p_rep_ptrs = rep_ptrs;
1212 20 : *p_n_replacements = n_replacements;
1213 :
1214 20 : return PDF_OK;
1215 : }
1216 :
1217 : static pdf_status_t
1218 : pdf_text_perform_replacements(pdf_text_t text, \
1219 : const pdf_size_t new_size, \
1220 : const pdf_text_t new_pattern, \
1221 : const pdf_text_t *p_old_patterns, \
1222 : const int n_old_patterns, \
1223 : const pdf_text_repl_t *rep_ptrs, \
1224 : const long n_replacements)
1225 : {
1226 : int k;
1227 : pdf_char_t *new_data;
1228 : pdf_char_t *new_walker;
1229 : pdf_char_t *old_walker;
1230 :
1231 : /* Allocate new memory chunk */
1232 12 : new_data = (pdf_char_t *)pdf_alloc(new_size);
1233 :
1234 : /* Walk the list of replacements */
1235 12 : new_walker = new_data;
1236 12 : old_walker = text->data;
1237 46 : for(k = 0; k < n_replacements; ++k)
1238 : {
1239 : pdf_size_t prev_size;
1240 : /* Store the data previous to the pointer */
1241 34 : prev_size = (rep_ptrs[k].data_ptr - old_walker);
1242 34 : if(prev_size > 0)
1243 : {
1244 26 : memcpy(new_walker, old_walker, prev_size);
1245 26 : new_walker += prev_size;
1246 26 : old_walker += prev_size;
1247 : }
1248 : /* Perform the replacement */
1249 34 : memcpy(new_walker, new_pattern->data, new_pattern->size);
1250 34 : new_walker += (new_pattern->size);
1251 34 : old_walker += (p_old_patterns[rep_ptrs[k].old_pattern_i]->size);
1252 : }
1253 :
1254 : /* Add final data */
1255 12 : if(((&(text->data[text->size])) - old_walker) > 0)
1256 : {
1257 2 : memcpy(new_walker, old_walker, \
1258 : ((&(text->data[text->size])) - old_walker));
1259 : }
1260 :
1261 : /* Set correct final size and final content */
1262 12 : pdf_dealloc(text->data);
1263 12 : text->data = new_data;
1264 12 : text->size = new_size;
1265 :
1266 12 : return PDF_OK;
1267 : }
1268 :
1269 :
1270 : pdf_status_t
1271 : pdf_text_replace_multiple (pdf_text_t text,
1272 : const pdf_text_t new_pattern,
1273 : const pdf_text_t *p_old_patterns,
1274 : const int n_old_patterns)
1275 28 : {
1276 28 : pdf_size_t new_size = 0;
1277 28 : pdf_size_t minimum_old_pattern_size = -1;
1278 : long n_replacements;
1279 28 : pdf_text_repl_t *rep_ptrs = NULL;
1280 :
1281 28 : if((p_old_patterns == NULL) || \
1282 : (n_old_patterns == 0))
1283 : {
1284 0 : return PDF_EBADDATA;
1285 : }
1286 :
1287 28 : if(pdf_text_check_replacement_patterns(p_old_patterns, \
1288 : n_old_patterns, \
1289 : &minimum_old_pattern_size) != PDF_OK)
1290 : {
1291 : PDF_DEBUG_BASE("At least one old pattern is not valid");
1292 : /* At least one old pattern is not valid */
1293 2 : return PDF_ETEXTENC;
1294 : }
1295 :
1296 : /* If input text is shorter than the smallest old pattern, there is no
1297 : * replacement to be done */
1298 26 : if(minimum_old_pattern_size > text->size)
1299 : {
1300 6 : return PDF_OK;
1301 : }
1302 :
1303 : /* First, count number of replacements to be done... a replacement pointer
1304 : * will be stored for each replacement needed */
1305 20 : if(pdf_text_get_replacement_pointers(&rep_ptrs, \
1306 : &n_replacements, \
1307 : &new_size, \
1308 : text, \
1309 : minimum_old_pattern_size, \
1310 : new_pattern, \
1311 : p_old_patterns, \
1312 : n_old_patterns) != PDF_OK)
1313 : {
1314 : PDF_DEBUG_BASE("Error getting replacement pointers");
1315 0 : return PDF_ETEXTENC;
1316 : }
1317 :
1318 : /* Now, really perform replacements, if required */
1319 20 : if(n_replacements > 0)
1320 : {
1321 12 : pdf_text_perform_replacements(text, \
1322 : new_size, \
1323 : new_pattern, \
1324 : p_old_patterns, \
1325 : n_old_patterns, \
1326 : rep_ptrs, \
1327 : n_replacements);
1328 12 : if(rep_ptrs != NULL)
1329 : {
1330 : /* Dealloc list of pointers to replacements */
1331 12 : pdf_dealloc(rep_ptrs);
1332 : }
1333 :
1334 12 : text->modified = PDF_TRUE;
1335 : }
1336 :
1337 20 : return PDF_OK;
1338 : }
1339 :
1340 :
1341 : /* Replace a given ASCII-7 pattern in a text object */
1342 : pdf_status_t
1343 : pdf_text_replace_ascii (pdf_text_t text,
1344 : const pdf_char_t *new_pattern,
1345 : const pdf_char_t *old_pattern)
1346 14 : {
1347 : /* Check if patterns are real ASCII-7 valid strings */
1348 14 : if((!pdf_text_is_ascii7(old_pattern,
1349 : (pdf_size_t)strlen((char *)old_pattern))) || \
1350 : (!pdf_text_is_ascii7(new_pattern,
1351 : (pdf_size_t)strlen((char *)new_pattern))))
1352 : {
1353 : PDF_DEBUG_BASE("At least one of the requested patterns is not "
1354 : "7-bit ASCII");
1355 0 : return PDF_EBADDATA;
1356 : }
1357 : else
1358 : {
1359 : /* Ok, so load ASCII strings as if it were UTF-8 strings */
1360 : pdf_text_t new_pattern_text;
1361 : pdf_text_t old_pattern_text;
1362 : pdf_status_t ret_code;
1363 :
1364 : /* Create intermediate pdf_text_t variables */
1365 14 : if(pdf_text_new_from_unicode(new_pattern,
1366 : (pdf_size_t) strlen ((char *) new_pattern),
1367 : PDF_TEXT_UTF8,
1368 : &new_pattern_text) != PDF_OK)
1369 : {
1370 : PDF_DEBUG_BASE("Error creating pdf_text_t from ASCII new pattern");
1371 0 : return PDF_EBADTEXT;
1372 : }
1373 14 : if(pdf_text_new_from_unicode(old_pattern,
1374 : (pdf_size_t) strlen ((char *)old_pattern),
1375 : PDF_TEXT_UTF8,
1376 : &old_pattern_text) != PDF_OK)
1377 : {
1378 : PDF_DEBUG_BASE("Error creating pdf_text_t from ASCII old pattern");
1379 0 : return PDF_EBADTEXT;
1380 : }
1381 :
1382 : /* Perform replacement */
1383 14 : ret_code = pdf_text_replace(text, new_pattern_text, old_pattern_text);
1384 :
1385 : /* Destroy used intermediate variables */
1386 14 : pdf_text_destroy(new_pattern_text);
1387 14 : pdf_text_destroy(old_pattern_text);
1388 :
1389 14 : return ret_code;
1390 : }
1391 : }
1392 :
1393 :
1394 : pdf_status_t
1395 : pdf_text_filter (pdf_text_t text,
1396 : const pdf_u32_t filter)
1397 63 : {
1398 : /* More than one filter at the same time can be requested! But Caution!
1399 : * UpperCase filter, LowerCase filter and TitleCase filter are mutually
1400 : * exclusive (at most only one of them must be enabled) */
1401 :
1402 63 : if((((filter & PDF_TEXT_FILTER_UPPER_CASE) ? 1 : 0) + \
1403 : ((filter & PDF_TEXT_FILTER_LOWER_CASE) ? 1 : 0) + \
1404 : ((filter & PDF_TEXT_FILTER_TITLE_CASE) ? 1 : 0)) > 1)
1405 : {
1406 : PDF_DEBUG_BASE("At most only one case conversion filter can be applied");
1407 0 : return PDF_EBADDATA;
1408 : }
1409 :
1410 : /* 0x00000001 */
1411 63 : if((filter & PDF_TEXT_FILTER_LINE_ENDINGS) && \
1412 : (pdf_text_filter_normalize_line_endings(text) != PDF_OK))
1413 : {
1414 : PDF_DEBUG_BASE("Error applying Line Ending normalization filter");
1415 0 : return PDF_ETEXTENC;
1416 : }
1417 :
1418 : /* 0x00000010 */
1419 63 : if((filter & PDF_TEXT_FILTER_UPPER_CASE) && \
1420 : (pdf_text_filter_upper_case(text) != PDF_OK))
1421 : {
1422 : PDF_DEBUG_BASE("Error applying Upper Case filter");
1423 0 : return PDF_ETEXTENC;
1424 : }
1425 : /* 0x00000100 */
1426 63 : else if((filter & PDF_TEXT_FILTER_LOWER_CASE) && \
1427 : (pdf_text_filter_lower_case(text) != PDF_OK))
1428 : {
1429 : PDF_DEBUG_BASE("Error applying Lower Case filter");
1430 0 : return PDF_ETEXTENC;
1431 : }
1432 : /* 0x00001000 */
1433 63 : else if((filter & PDF_TEXT_FILTER_TITLE_CASE) && \
1434 : (pdf_text_filter_title_case(text) != PDF_OK))
1435 : {
1436 : PDF_DEBUG_BASE("Error applying Title Case filter");
1437 0 : return PDF_ETEXTENC;
1438 : }
1439 :
1440 : /* 0x00010000 */
1441 63 : if((filter & PDF_TEXT_FILTER_REMOVE_AMP) && \
1442 : (pdf_text_filter_remove_amp(text) != PDF_OK))
1443 : {
1444 : PDF_DEBUG_BASE("Error applying Ampersand Removal filter");
1445 0 : return PDF_ETEXTENC;
1446 : }
1447 :
1448 : /* 0x00100000 */
1449 63 : if((filter & PDF_TEXT_FILTER_NORM_WITH_FULL_WIDTH) && \
1450 : (pdf_text_filter_normalize_full_width_ascii(text) != PDF_OK))
1451 : {
1452 : PDF_DEBUG_BASE("Error applying FullWidth Normalization filter");
1453 0 : return PDF_ETEXTENC;
1454 : }
1455 :
1456 : /* 0x01000000 */
1457 63 : if ((filter & PDF_TEXT_FILTER_REMOVE_LINE_ENDINGS) &&
1458 : (pdf_text_filter_remove_line_endings (text) != PDF_OK))
1459 : {
1460 : PDF_DEBUG_BASE ("Error applying Line Ending Removal filter");
1461 0 : return PDF_ETEXTENC;
1462 : }
1463 :
1464 63 : text->modified = PDF_TRUE;
1465 63 : return PDF_OK;
1466 : }
1467 :
1468 : const pdf_char_t *
1469 : pdf_text_get_printable (pdf_text_t text)
1470 0 : {
1471 : pdf_size_t size;
1472 :
1473 0 : if (text->printable != NULL){
1474 0 : if (text->modified == PDF_FALSE){
1475 0 : return text->printable;
1476 : }else{
1477 0 : pdf_dealloc (text->printable);
1478 : }
1479 : }
1480 :
1481 : #ifdef PDF_HOST_WIN32
1482 : pdf_text_get_unicode (&text->printable, &size, text, PDF_TEXT_UTF16_LE,
1483 : PDF_TEXT_UNICODE_WITH_NUL_SUFFIX);
1484 : #else
1485 0 : pdf_text_get_unicode (&text->printable, &size, text, PDF_TEXT_UTF8,
1486 : PDF_TEXT_UNICODE_WITH_NUL_SUFFIX);
1487 : #endif /*PDF_HOST_WIN32*/
1488 :
1489 0 : text->modified = PDF_FALSE;
1490 :
1491 0 : return text->printable;
1492 : }
1493 :
1494 :
1495 : pdf_i32_t
1496 : pdf_text_cmp (const pdf_text_t text1,
1497 : const pdf_text_t text2,
1498 : const pdf_bool_t case_sensitive,
1499 : pdf_status_t *p_ret_code)
1500 9 : {
1501 9 : if(p_ret_code != NULL)
1502 : {
1503 6 : *p_ret_code = PDF_OK;
1504 : }
1505 :
1506 : /* Compare sizes of the texts */
1507 9 : if(text1->size != text2->size)
1508 : {
1509 2 : return ((text1->size > text2->size) ? 1 : -1);
1510 : }
1511 :
1512 7 : if(case_sensitive == PDF_TRUE)
1513 : {
1514 5 : return memcmp(text1->data, text2->data, text1->size);
1515 : }
1516 : else
1517 : {
1518 2 : return pdf_text_cmp_non_case_sensitive(text1, text2, p_ret_code);
1519 : }
1520 : }
1521 :
1522 :
1523 : /* -------------------------- Private functions ----------------------------- */
1524 :
1525 : static pdf_i32_t
1526 : pdf_text_cmp_non_case_sensitive(pdf_text_t text1,
1527 : pdf_text_t text2,
1528 : pdf_status_t *p_ret_code)
1529 2 : {
1530 : /* Generate word boundaries list, if not already done */
1531 2 : if((pdf_text_fill_word_boundaries_list(text1->word_boundaries, \
1532 : text1->data, \
1533 : text1->size) == PDF_OK) && \
1534 : (pdf_text_fill_word_boundaries_list(text2->word_boundaries, \
1535 : text2->data, \
1536 : text2->size) == PDF_OK))
1537 : {
1538 : pdf_size_t size1;
1539 : pdf_size_t size2;
1540 :
1541 4 : size1 = pdf_list_size(text1->word_boundaries);
1542 4 : size2 = pdf_list_size(text2->word_boundaries);
1543 : /* First, compare number of words in each text */
1544 2 : if(size1 != size2)
1545 : {
1546 : PDF_DEBUG_BASE("Different sizes...");
1547 0 : return ((size1 > size2) ? 1 : -1);
1548 : }
1549 : else
1550 : {
1551 : /* Perform a word-per-word lower case comparison! */
1552 : int n;
1553 :
1554 : /* Get word from both texts */
1555 2 : n = 0;
1556 22 : while(n < size1)
1557 : {
1558 : struct pdf_text_wb_s *p_word1;
1559 : struct pdf_text_wb_s *p_word2;
1560 : pdf_i32_t ret_num;
1561 :
1562 18 : if(pdf_list_get_at(text1->word_boundaries, \
1563 : n, \
1564 : (const void **)&p_word1) != PDF_OK)
1565 : {
1566 0 : *p_ret_code = PDF_ETEXTENC;
1567 : PDF_DEBUG_BASE("Error getting word '%d' from text1", n);
1568 : /* An error happened computing word boundaries! */
1569 0 : return -1;
1570 : }
1571 :
1572 18 : if(pdf_list_get_at(text2->word_boundaries,
1573 : n,
1574 : (const void **)&p_word2) != PDF_OK)
1575 : {
1576 0 : *p_ret_code = PDF_ETEXTENC;
1577 : PDF_DEBUG_BASE("Error getting word '%d' from text2", n);
1578 : /* An error happened computing word boundaries! */
1579 0 : return -1;
1580 : }
1581 :
1582 18 : ret_num = pdf_text_compare_words(p_word1->word_start,
1583 : p_word1->word_size,
1584 : p_word2->word_start,
1585 : p_word2->word_size,
1586 : pdf_text_get_language(text1),
1587 : pdf_text_get_language(text2),
1588 : p_ret_code);
1589 : /* If words are not equal, return the code */
1590 18 : if(ret_num != 0)
1591 : {
1592 : PDF_DEBUG_BASE("Words are not equal...");
1593 0 : return ret_num;
1594 : }
1595 18 : ++n;
1596 : }
1597 : /* If arrived here, the strings are completely equal */
1598 2 : return 0;
1599 : }
1600 : }
1601 : else
1602 : {
1603 0 : if(p_ret_code != NULL)
1604 : {
1605 0 : *p_ret_code = PDF_ETEXTENC;
1606 : }
1607 : PDF_DEBUG_BASE("Problem computing word boundaries. Comparison is not"
1608 : " valid");
1609 0 : return -1; /* An error happened computing word boundaries! */
1610 : }
1611 : return 0;
1612 : }
1613 :
1614 :
1615 : static pdf_i32_t
1616 : pdf_text_compare_words(const pdf_char_t *word1,
1617 : const pdf_size_t size1,
1618 : const pdf_char_t *word2,
1619 : const pdf_size_t size2,
1620 : const pdf_char_t *language1,
1621 : const pdf_char_t *language2,
1622 : pdf_status_t *p_ret_code)
1623 18 : {
1624 : pdf_char_t *lower1;
1625 : pdf_char_t *lower2;
1626 : pdf_size_t new_size1;
1627 : pdf_size_t new_size2;
1628 : pdf_size_t worst_size;
1629 :
1630 18 : if(p_ret_code != NULL)
1631 : {
1632 18 : *p_ret_code = PDF_OK;
1633 : }
1634 :
1635 : /* Compare sizes of words */
1636 18 : if(size1 != size2)
1637 : {
1638 0 : return ((size1 > size2) ? 1 : -1);
1639 : }
1640 :
1641 : /* Compute new worst word length */
1642 18 : worst_size = size1 * UCD_SC_MAX_EXPAND;
1643 :
1644 : /* Allocate memory for lowercases */
1645 18 : lower1 = (pdf_char_t *)pdf_alloc(worst_size);
1646 18 : lower2 = (pdf_char_t *)pdf_alloc(worst_size);
1647 18 : if((lower1 == NULL) || \
1648 : (lower2 == NULL))
1649 : {
1650 : PDF_DEBUG_BASE("Unable to compare words");
1651 0 : if(p_ret_code != NULL)
1652 : {
1653 0 : *p_ret_code = PDF_ENOMEM;
1654 : }
1655 0 : if(lower1 != NULL)
1656 : {
1657 0 : pdf_dealloc(lower1);
1658 : }
1659 0 : if(lower2 != NULL)
1660 : {
1661 0 : pdf_dealloc(lower2);
1662 : }
1663 0 : return -1;
1664 : }
1665 :
1666 : /* Lowercase words */
1667 18 : if(pdf_text_ucd_word_change_case(lower1, &new_size1,
1668 : UNICODE_CASE_INFO_LOWER_CASE,
1669 : word1, size1, language1)!= PDF_OK)
1670 : {
1671 : PDF_DEBUG_BASE("Problem lowercasing word 1");
1672 0 : pdf_dealloc(lower1);
1673 0 : pdf_dealloc(lower2);
1674 0 : if(p_ret_code != NULL)
1675 : {
1676 0 : *p_ret_code = PDF_ETEXTENC;
1677 : }
1678 0 : return -1;
1679 : }
1680 18 : if(pdf_text_ucd_word_change_case(lower2, &new_size2,
1681 : UNICODE_CASE_INFO_LOWER_CASE,
1682 : word2, size2, language2)!= PDF_OK)
1683 : {
1684 : PDF_DEBUG_BASE("Problem lowercasing word 2");
1685 0 : pdf_dealloc(lower1);
1686 0 : pdf_dealloc(lower2);
1687 0 : if(p_ret_code != NULL)
1688 : {
1689 0 : *p_ret_code = PDF_ETEXTENC;
1690 : }
1691 0 : return -1;
1692 : }
1693 :
1694 : /* Compare NEW sizes of words */
1695 18 : if(new_size1 != new_size2)
1696 : {
1697 0 : pdf_dealloc(lower1);
1698 0 : pdf_dealloc(lower2);
1699 0 : return ((new_size1 > new_size2) ? 1 : -1);
1700 : }
1701 : else
1702 : {
1703 : /* Compare contents of words */
1704 : pdf_i32_t ret_val;
1705 18 : ret_val = memcmp(lower1, lower2, new_size1);
1706 18 : pdf_dealloc(lower1);
1707 18 : pdf_dealloc(lower2);
1708 18 : return ret_val;
1709 : }
1710 : }
1711 :
1712 :
1713 : /* Function to clean all contents of a given pdf_text_t variable */
1714 : void
1715 : pdf_text_clean_contents(pdf_text_t text)
1716 691 : {
1717 : /* Clear all contents */
1718 691 : if(text->data != NULL)
1719 : {
1720 0 : pdf_dealloc(text->data);
1721 0 : text->data = NULL;
1722 : }
1723 :
1724 : /* Clean list of word breaks (destroy and create empty) */
1725 691 : pdf_text_clean_word_boundaries_list(&(text->word_boundaries));
1726 :
1727 : /* Clean country and language info */
1728 691 : memset(&(text->lang[0]), 0, PDF_TEXT_CCL);
1729 691 : memset(&(text->country[0]), 0, PDF_TEXT_CCL);
1730 : /* Reset data size */
1731 691 : text->size = 0;
1732 :
1733 691 : text->modified = PDF_FALSE;
1734 691 : if (text->printable != NULL){
1735 0 : pdf_dealloc (text->printable);
1736 0 : text->printable = NULL;
1737 : }
1738 691 : }
1739 :
1740 :
1741 :
1742 : static pdf_status_t
1743 : pdf_text_get_lang_from_utf16be(pdf_text_t element,
1744 : pdf_char_t **str_out,
1745 : pdf_size_t *str_out_length,
1746 : const pdf_char_t *str_in,
1747 : const pdf_size_t str_in_length)
1748 28 : {
1749 : /* Country code is optional */
1750 28 : short country_available = PDF_FALSE;
1751 : pdf_char_t aux[PDF_TEXT_CCL];
1752 :
1753 : /* Check first code marker and MINIMUM length of array */
1754 28 : if((str_in_length < PDF_TEXT_LCMINL) || \
1755 : (str_in[1] != PDF_TEXT_LCI_1) || \
1756 : (str_in[0] != PDF_TEXT_LCI_0))
1757 : {
1758 0 : return PDF_EBADDATA;
1759 : }
1760 :
1761 : /* Check last code marker position and MAXIMUM length of array.
1762 : * Additionally, set `str_out' and `str_out_length' */
1763 42 : if((str_in[5] != PDF_TEXT_LCI_1) || \
1764 : (str_in[4] != PDF_TEXT_LCI_0))
1765 : {
1766 : /* Check last marker in bytes 6 and 7... */
1767 14 : if((str_in_length >= PDF_TEXT_LCMAXL) && \
1768 : (str_in[7] == PDF_TEXT_LCI_1) && \
1769 : (str_in[6] == PDF_TEXT_LCI_0))
1770 : {
1771 14 : country_available = PDF_TRUE;
1772 14 : *str_out = (pdf_char_t *)str_in + PDF_TEXT_LCMAXL;
1773 14 : *str_out_length = str_in_length - PDF_TEXT_LCMAXL;
1774 : }
1775 : else
1776 : {
1777 : /* Either size is too short or last marker not found. This is a
1778 : * problem in the input data string */
1779 0 : return PDF_EBADDATA;
1780 : }
1781 : }
1782 : else
1783 : {
1784 : /* There is no optional country code info */
1785 14 : *str_out = (pdf_char_t *)str_in + PDF_TEXT_LCMINL;
1786 14 : *str_out_length = str_in_length - PDF_TEXT_LCMINL;
1787 : }
1788 :
1789 :
1790 : /* Store 2-bytes ISO 639 language code */
1791 28 : memcpy(&aux[0], &str_in[2], PDF_TEXT_CCL-1);
1792 28 : aux[PDF_TEXT_CCL-1] = '\0';
1793 28 : if(pdf_text_set_language(element, (pdf_char_t *)aux) != PDF_OK)
1794 : {
1795 0 : return PDF_ETEXTENC;
1796 : }
1797 :
1798 : /* If optional country code is also available, store it... */
1799 28 : if(country_available)
1800 : {
1801 14 : memcpy(&aux[0], &str_in[4], PDF_TEXT_CCL-1);
1802 : /* Last NUL byte is already set */
1803 : /* Store 2-bytes ISO 3166 country code */
1804 14 : if(pdf_text_set_country(element, (pdf_char_t *)aux) != PDF_OK)
1805 : {
1806 0 : return PDF_ETEXTENC;
1807 : }
1808 : }
1809 :
1810 28 : return PDF_OK;
1811 : }
1812 :
1813 : static enum pdf_text_unicode_encoding_e
1814 : pdf_text_transform_he_to_unicode_encoding(enum pdf_text_unicode_encoding_e enc)
1815 : {
1816 1121 : if((enc == PDF_TEXT_UTF16_HE) || \
1817 : (enc == PDF_TEXT_UTF32_HE))
1818 : {
1819 267 : enc += (PDF_IS_BIG_ENDIAN ? PDF_TEXT_HE_TO_BE : PDF_TEXT_HE_TO_LE);
1820 : }
1821 1121 : return enc;
1822 : }
1823 :
1824 :
1825 : static pdf_status_t
1826 : pdf_text_get_unicode_string_header(pdf_char_t header[PDF_TEXT_USHMAXL],
1827 : pdf_size_t *header_length,
1828 : const enum pdf_text_unicode_encoding_e enc,
1829 : const pdf_u32_t options,
1830 : const pdf_char_t *language,
1831 : const pdf_char_t *country)
1832 108 : {
1833 : short bom_bytes;
1834 : short lang_bytes;
1835 108 : pdf_text_bom_t bom = pdf_text_get_unicode_bom(enc);
1836 :
1837 : /* We know that these pointers will never be null if the function is only
1838 : * called by pdf_text_get_unicode, but just in case */
1839 108 : if((language == NULL) || \
1840 : (country == NULL) || \
1841 : (header_length == NULL))
1842 : {
1843 : PDF_DEBUG_BASE("Invalid pointers received");
1844 0 : return PDF_EBADDATA;
1845 : }
1846 :
1847 : /* Check if BOM really requested */
1848 108 : bom_bytes = 0;
1849 108 : if(options & PDF_TEXT_UNICODE_WITH_BOM)
1850 : {
1851 84 : bom_bytes = bom.bom_bytes;
1852 : }
1853 :
1854 : /* Check if Lang/Country code really requested (only for UTF16BE!!) */
1855 108 : lang_bytes = 0;
1856 108 : if((enc == PDF_TEXT_UTF16_BE) && \
1857 : (options & PDF_TEXT_UTF16BE_WITH_LANGCODE) && \
1858 : (strlen((char *)language) == 2))
1859 : {
1860 : /* At least language is available, but country may also be
1861 : * available */
1862 48 : lang_bytes = (strlen((char *)country) == 2) ? PDF_TEXT_LCMAXL: \
1863 : PDF_TEXT_LCMINL;
1864 : }
1865 :
1866 : /* Modify header array, if needed, to add Language/Country info and/or
1867 : * BOM */
1868 108 : *header_length = lang_bytes + bom_bytes;
1869 108 : if((*header_length > 0) && \
1870 : (*header_length < PDF_TEXT_USHMAXL)) /* (just in case) */
1871 : {
1872 : pdf_char_t *walker;
1873 108 : walker = &header[0];
1874 :
1875 : /* Add BOM */
1876 108 : if(bom_bytes > 0)
1877 : {
1878 84 : memcpy(walker, bom.bom_data, bom_bytes);
1879 : /* Update walker */
1880 84 : walker += bom_bytes;
1881 : }
1882 :
1883 : /* Add Lang/Country */
1884 108 : if(lang_bytes > 0)
1885 : {
1886 : /* Language and Country */
1887 48 : if(lang_bytes == PDF_TEXT_LCMAXL)
1888 : {
1889 24 : sprintf((char *)walker, "%c%c%2s%2s%c%c",
1890 : PDF_TEXT_LCI_0,PDF_TEXT_LCI_1,
1891 : language, country,
1892 : PDF_TEXT_LCI_0,PDF_TEXT_LCI_1);
1893 : }
1894 : /* Language only */
1895 : else
1896 : {
1897 24 : sprintf((char *)walker, "%c%c%2s%c%c",
1898 : PDF_TEXT_LCI_0,PDF_TEXT_LCI_1,
1899 : language,
1900 : PDF_TEXT_LCI_0,PDF_TEXT_LCI_1);
1901 : }
1902 : }
1903 : }
1904 108 : return PDF_OK;
1905 : }
1906 :
1907 :
1908 :
1909 : pdf_bool_t
1910 : pdf_text_is_ascii7(const pdf_char_t *utf8data, const pdf_size_t size)
1911 32 : {
1912 : pdf_size_t i;
1913 159 : for(i=0; i<size; ++i)
1914 : {
1915 : /* Just check the MSB. In ASCII-7 it must be 0 */
1916 127 : if(utf8data[i] & 0x80)
1917 : {
1918 0 : return PDF_FALSE;
1919 : }
1920 : }
1921 32 : return PDF_TRUE;
1922 : }
1923 :
1924 :
1925 : /* Generate Word Boundaries list from text object */
1926 : pdf_status_t
1927 : pdf_text_generate_word_boundaries(pdf_text_t text)
1928 54 : {
1929 108 : if(pdf_list_size(text->word_boundaries) == 0)
1930 : {
1931 54 : return pdf_text_fill_word_boundaries_list(text->word_boundaries,
1932 : text->data, text->size);
1933 : }
1934 : else
1935 : {
1936 : /* List already created */
1937 0 : return PDF_OK;
1938 : }
1939 : }
1940 :
1941 : pdf_status_t
1942 : pdf_text_destroy_word_boundaries_list(pdf_list_t *p_word_boundaries)
1943 818 : {
1944 : pdf_size_t n_words;
1945 : pdf_size_t i;
1946 : /* Walk list of words */
1947 1636 : n_words = pdf_list_size(*p_word_boundaries);
1948 1166 : for(i = 0; i < n_words; ++i)
1949 : {
1950 348 : struct pdf_text_wb_s *p_word = NULL;
1951 :
1952 : /* Get word to process from list of words */
1953 348 : if(pdf_list_get_at(*p_word_boundaries, \
1954 : i, \
1955 : (const void **)&p_word) != PDF_OK)
1956 : {
1957 0 : return PDF_ETEXTENC;
1958 : }
1959 : /* Dealloc word (pointed by the list element) */
1960 348 : pdf_dealloc(p_word);
1961 : }
1962 : /* Destroy list */
1963 818 : pdf_list_destroy(*p_word_boundaries);
1964 818 : return PDF_OK;
1965 : }
1966 :
1967 :
1968 : /* Create empty Word Boundaries list */
1969 : pdf_status_t
1970 : pdf_text_create_word_boundaries_list(pdf_list_t *p_word_boundaries)
1971 994 : {
1972 : pdf_list_t temp_list;
1973 : /* Initialize word boundaries list */
1974 994 : if(pdf_list_new (NULL, NULL, PDF_TRUE, &temp_list) != PDF_OK)
1975 : {
1976 0 : return PDF_ETEXTENC;
1977 : }
1978 :
1979 : /* Set output if everything went ok */
1980 994 : *p_word_boundaries = temp_list;
1981 994 : return PDF_OK;
1982 : }
1983 :
1984 : /* Clean (destroy and create empty) Word Boundaries list */
1985 : static pdf_status_t
1986 : pdf_text_clean_word_boundaries_list(pdf_list_t *p_word_boundaries)
1987 : {
1988 : /* Only destroy+create if list is not empty! */
1989 1382 : if(pdf_list_size(*p_word_boundaries) != 0)
1990 : {
1991 : /* Destroy element contents */
1992 0 : pdf_text_destroy_word_boundaries_list(p_word_boundaries);
1993 : /* Create empty list */
1994 0 : return pdf_text_create_word_boundaries_list(p_word_boundaries);
1995 : }
1996 : else
1997 : {
1998 : /* List is already empty */
1999 691 : return PDF_OK;
2000 : }
2001 : }
2002 :
2003 :
2004 : /* Fill in the Word Boundaries list using the given data */
2005 : static pdf_status_t
2006 : pdf_text_fill_word_boundaries_list(pdf_list_t word_boundaries,
2007 : const pdf_char_t *data,
2008 : const pdf_size_t size)
2009 58 : {
2010 : /* Perform a basic check of data length */
2011 58 : if(size % 4 != 0)
2012 : {
2013 0 : return PDF_EBADDATA;
2014 : }
2015 :
2016 : /* Only try to find word boundaries if length is greater than 0! */
2017 58 : if(size > 0)
2018 : {
2019 : pdf_char_t *walker;
2020 : pdf_size_t n_bytes_left;
2021 :
2022 : /* Initialize walker and number of bytes left */
2023 58 : walker = (pdf_char_t *)data;
2024 58 : n_bytes_left = size;
2025 :
2026 308 : while(n_bytes_left > 0)
2027 : {
2028 192 : struct pdf_text_wb_s *p_word = NULL;
2029 :
2030 : /* Allocate new word */
2031 192 : p_word = (struct pdf_text_wb_s *)pdf_alloc(sizeof(struct pdf_text_wb_s));
2032 192 : if(p_word == NULL)
2033 : {
2034 0 : return PDF_ENOMEM;
2035 : }
2036 :
2037 : /* RULE WB1: Break at the start of text ( SOT % ) */
2038 192 : p_word->word_start = walker;
2039 :
2040 192 : if(pdf_text_ucd_wb_detect_next(walker,
2041 : n_bytes_left,
2042 : &(p_word->word_stop),
2043 : &n_bytes_left)!= PDF_OK)
2044 : {
2045 0 : return PDF_ETEXTENC;
2046 : }
2047 :
2048 : /* Compute word size in bytes */
2049 192 : p_word->word_size = (p_word->word_stop - p_word->word_start) + 4;
2050 :
2051 : /* Add new word boundary to list */
2052 192 : pdf_list_add_last(word_boundaries, p_word, NULL);
2053 :
2054 : /* Update walker */
2055 192 : walker = p_word->word_stop + 4;
2056 : }
2057 : }
2058 :
2059 58 : return PDF_OK;
2060 : }
2061 :
2062 :
2063 : /* End of pdf-text.c */
|