LTP GCOV extension - code coverage report
Current view: directory - src/base - pdf-text.c
Test: libgnupdf.info
Date: 2010-07-31 Instrumented lines: 583
Code covered: 82.5 % Executed lines: 481

       1                 : /* -*- mode: C -*-
       2                 :  *
       3                 :  *       File:         pdf-text.c
       4                 :  *       Date:         Fri Jan 11 21:09:56 2008
       5                 :  *
       6                 :  *       GNU PDF Library - Encoded Text handling utilities
       7                 :  *
       8                 :  */
       9                 : 
      10                 : /* Copyright (C) 2008 Free Software Foundation, Inc. */
      11                 : 
      12                 : /* This program is free software: you can redistribute it and/or modify
      13                 :  * it under the terms of the GNU General Public License as published by
      14                 :  * the Free Software Foundation, either version 3 of the License, or
      15                 :  * (at your option) any later version.
      16                 :  *
      17                 :  * This program is distributed in the hope that it will be useful,
      18                 :  * but WITHOUT ANY WARRANTY; without even the implied warranty of
      19                 :  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
      20                 :  * GNU General Public License for more details.
      21                 :  *
      22                 :  * You should have received a copy of the GNU General Public License
      23                 :  * along with this program.  If not, see <http://www.gnu.org/licenses/>.
      24                 :  */
      25                 : 
      26                 : #include <config.h>
      27                 : 
      28                 : #include <limits.h>
      29                 : #include <stdlib.h>
      30                 : #include <string.h>
      31                 : #include <stdint.h>
      32                 : #ifdef HAVE_MALLOC_H
      33                 :   #include <malloc.h>
      34                 : #endif /* HAVE_MALLOC_H */
      35                 : #include <xalloc.h>
      36                 : #include <math.h>
      37                 : 
      38                 : #include <pdf-text.h>
      39                 : #include <pdf-text-encoding.h>
      40                 : #include <pdf-text-host-encoding.h>
      41                 : #include <pdf-text-context.h>
      42                 : #include <pdf-text-filter.h>
      43                 : #include <pdf-text-ucd.h>
      44                 : 
      45                 : 
      46                 : /* Minimum length, in bytes, of the Lang/Country information within a
      47                 :  *  UTF-16BE string (2 bytes for the first marker, 2 bytes for LANG and 2 bytes
      48                 :  *  for the last marker). */
      49                 : #define PDF_TEXT_LCMINL     6
      50                 : 
      51                 : /* Maximum size, in bytes, of the Lang/Country information within a UTF16BE
      52                 :  *  string (Minimum size + 2 bytes for COUNTRY). */
      53                 : #define PDF_TEXT_LCMAXL     8
      54                 : 
      55                 : 
      56                 : /* Longest header length when requesting a unicode string with options is that
      57                 :  * of UTF-16BE with BOM and lang/country information: 2bytes-BOM +
      58                 :  * 8bytes-lang/country = 10 bytes (+ 1 NUL byte) */
      59                 : #define PDF_TEXT_USHMAXL    11
      60                 : 
      61                 : /* ---------------- Static (private) functions prototypes ------------------- */
      62                 : 
      63                 : 
      64                 : /* This function receives as input a valid pdf_text_t element, where the
      65                 :  *  language and country code information will be stored. In addition to this,
      66                 :  *  the function receives as input the data string (starting at the first
      67                 :  *  marker), and stores a pointer to the continuation of the data string, after
      68                 :  *  having read the language/country information. This function assumes
      69                 :  *  that the first bytes of the input data string contain the country/lang
      70                 :  *  information.
      71                 :  * Two options are possible:
      72                 :  *   XXllXX   (6 bytes, XX is the marker, ll the language)
      73                 :  *   XXllccXX (8 bytes, XX is the marker, ll the language and cc the country)
      74                 :  */
      75                 : static pdf_status_t
      76                 : pdf_text_get_lang_from_utf16be(pdf_text_t element,
      77                 :                                pdf_char_t **str_out,
      78                 :                                pdf_size_t *str_out_length,
      79                 :                                const pdf_char_t *str_in,
      80                 :                                const pdf_size_t str_in_length);
      81                 : 
      82                 : /* Function to get the header of a unicode string as requested in the
      83                 :  * `options' field when calling `pdf_text_get_unicode'. The header can be:
      84                 :  *  - BOM
      85                 :  *  - BOM + Lang/Country info (only if UTF-16BE requested)
      86                 :  *  - Lang/Country info (only if UTF-16BE requested)
      87                 :  */
      88                 : static pdf_status_t
      89                 : pdf_text_get_unicode_string_header(pdf_char_t header[PDF_TEXT_USHMAXL],
      90                 :                                    pdf_size_t *header_length,
      91                 :                                    const enum pdf_text_unicode_encoding_e enc,
      92                 :                                    const pdf_u32_t options,
      93                 :                                    const pdf_char_t *language,
      94                 :                                    const pdf_char_t *country);
      95                 : 
      96                 : /* Function to convert a given Unicode Host Endian enumeration to the `real'
      97                 :  *  endianness (BE or LE). If a non-HE enumeration is passed to the function,
      98                 :  *  it will return the same enumeration value unchanged */
      99                 : static enum pdf_text_unicode_encoding_e
     100                 : pdf_text_transform_he_to_unicode_encoding(enum pdf_text_unicode_encoding_e enc);
     101                 : 
     102                 : /* Function to compare two given words */
     103                 : static pdf_i32_t
     104                 : pdf_text_compare_words(const pdf_char_t *word1,
     105                 :                        const pdf_size_t size1,
     106                 :                        const pdf_char_t *word2,
     107                 :                        const pdf_size_t size2,
     108                 :                        const pdf_char_t *language1,
     109                 :                        const pdf_char_t *language2,
     110                 :                        pdf_status_t *p_ret_code);
     111                 : 
     112                 : /* Non-Case sensitive comparison of text objects */
     113                 : static pdf_i32_t
     114                 : pdf_text_cmp_non_case_sensitive(pdf_text_t text1,
     115                 :                                 pdf_text_t text2,
     116                 :                                 pdf_status_t *p_ret_code);
     117                 : 
     118                 : /* Clean (destroy and create empty) Word Boundaries list */
     119                 : static pdf_status_t
     120                 : pdf_text_clean_word_boundaries_list(pdf_list_t *p_word_boundaries);
     121                 : /* Fill in the Word Boundaries list using the given data */
     122                 : static pdf_status_t
     123                 : pdf_text_fill_word_boundaries_list(pdf_list_t word_boundaries,
     124                 :                                    const pdf_char_t *data,
     125                 :                                    const pdf_size_t size);
     126                 : 
     127                 : 
     128                 : /* ----------------------------- Public functions ----------------------------*/
     129                 : 
     130                 : 
     131                 : 
     132                 : pdf_status_t
     133                 : pdf_text_init(void)
     134             735 : {
     135                 :   /* Initiate Text module context */
     136             735 :   return pdf_text_context_init();
     137                 : }
     138                 : 
     139                 : 
     140                 : pdf_status_t
     141                 : pdf_text_new (pdf_text_t *text)
     142             941 : {
     143                 :   /* The text global state should be initialized! */
     144             941 :   if (pdf_text_context_initialized () == PDF_FALSE)
     145                 :     {
     146               1 :       return PDF_EBADCONTEXT;
     147                 :     }
     148                 : 
     149                 :   /* Allocate memory for the new text structure */
     150             940 :   *text = (pdf_text_t) pdf_alloc (sizeof(struct pdf_text_s));
     151             940 :   if (*text == NULL)
     152                 :     {
     153                 :       /* Out of memory condition */
     154               0 :       return PDF_ENOMEM;
     155                 :     }
     156                 : 
     157                 :   /* Initialize all contents */
     158             940 :   (*text)->data = NULL;
     159             940 :   (*text)->size = 0;
     160             940 :   (*text)->printable = NULL;
     161             940 :   (*text)->modified = PDF_FALSE;
     162             940 :   memset(&((*text)->lang[0]), 0, PDF_TEXT_CCL);
     163             940 :   memset(&((*text)->country[0]), 0, PDF_TEXT_CCL);
     164                 : 
     165                 :   /* Create empty word boundaries list */
     166             940 :   if(pdf_text_create_word_boundaries_list(&((*text)->word_boundaries)) != \
     167                 :      PDF_OK)
     168                 :     {
     169               0 :       pdf_dealloc(*text);
     170               0 :       *text = NULL;
     171                 :     }
     172                 : 
     173                 :   /* Success! */
     174             940 :   return PDF_OK;
     175                 : }
     176                 : 
     177                 : 
     178                 : pdf_status_t
     179                 : pdf_text_destroy (pdf_text_t text)
     180             764 : {
     181                 :   /* Dealloc memory */
     182             764 :   if(text->data != NULL)
     183                 :     {
     184             664 :       pdf_dealloc(text->data);
     185             664 :       text->data = NULL;
     186                 :     }
     187                 : 
     188             764 :   if (text->printable != NULL)
     189                 :     {
     190               0 :       pdf_dealloc (text->printable);
     191                 :     }
     192                 : 
     193                 :   /* Destroy word boundaries list */
     194             764 :   pdf_text_destroy_word_boundaries_list(&text->word_boundaries);
     195                 : 
     196                 :   /* Finally, clear full structure */
     197             764 :   pdf_dealloc(text);
     198                 : 
     199             764 :   return PDF_OK;
     200                 : }
     201                 : 
     202                 : 
     203                 : pdf_text_t
     204                 : pdf_text_dup (const pdf_text_t text)
     205              31 : {
     206                 :   pdf_text_t element;
     207                 : 
     208              31 :   if (text == NULL)
     209                 :     {
     210               1 :       return NULL;
     211                 :     }
     212                 : 
     213                 :   /* Allocate and initialize element */
     214              30 :   if (pdf_text_new (&element) == PDF_OK)
     215                 :     {
     216                 :       /* Duplicate size */
     217              30 :       element->size = text->size;
     218                 : 
     219                 :       /* Duplicate contents (if size > 0) */
     220              30 :       if(element->size > 0)
     221                 :         {
     222              29 :           element->data = (pdf_char_t *) pdf_alloc (element->size);
     223              29 :           if(element->data != NULL)
     224                 :             {
     225              29 :               memcpy(element->data, text->data, (size_t)element->size);
     226                 :             }
     227                 :         }
     228                 : 
     229                 :       /* Duplicate Language code and Country code (if available) */
     230              30 :       memcpy(element->lang, text->lang, (size_t) PDF_TEXT_CCL);
     231              30 :       memcpy(element->country, text->country, (size_t) PDF_TEXT_CCL);
     232                 : 
     233                 :       /* We don't really need to duplicate the contents of the word
     234                 :        *  boundaries list, as it is a side product, same with printable */
     235                 : 
     236                 :       /* Set output element...*/
     237              30 :       return element;
     238                 :     }
     239                 :   else
     240                 :     {
     241                 :       /* Dup failed */
     242               0 :       return NULL;
     243                 :     }
     244                 : }
     245                 : 
     246                 : 
     247                 : 
     248                 : pdf_status_t
     249                 : pdf_text_new_from_host (const pdf_char_t *str,
     250                 :                         const pdf_size_t size,
     251                 :                         const pdf_text_host_encoding_t enc,
     252                 :                         pdf_text_t *text)
     253               7 : {
     254               7 :   pdf_text_t element = NULL;
     255               7 :   pdf_status_t ret_code = PDF_ETEXTENC;
     256                 :   pdf_status_t ret_code_new;
     257                 : 
     258               7 :   if((str == NULL) || \
     259                 :      (size == 0))
     260                 :     {
     261               2 :       return PDF_EBADDATA;
     262                 :     }
     263                 : 
     264                 :   /* Allocate and initialize element */
     265               5 :   ret_code_new = pdf_text_new (&element);
     266               5 :   if (ret_code_new != PDF_OK)
     267                 :     {
     268                 :       /* Oops, element creation failed due to an error... */
     269               0 :       return ret_code_new;
     270                 :     }
     271                 : 
     272                 :   /* Set Host Encoding contents */
     273               5 :   ret_code = pdf_text_set_host(element, str, size, enc);
     274                 : 
     275               5 :   if(ret_code == PDF_OK)
     276                 :     {
     277                 :       /* Perfect! Set output variable */
     278               3 :       *text = element;
     279                 :     }
     280                 :   else
     281                 :     {
     282                 :       /* Conversion went wrong... so destroy object contents */
     283               2 :       pdf_text_destroy(element);
     284                 :     }
     285                 : 
     286                 :   /* Return status of the conversion */
     287               5 :   return ret_code;
     288                 : }
     289                 : 
     290                 : 
     291                 : 
     292                 : pdf_status_t
     293                 : pdf_text_new_from_pdf_string (const pdf_char_t *str,
     294                 :                               const pdf_size_t size,
     295                 :                               pdf_char_t **remaining_str,
     296                 :                               pdf_size_t *remaining_length,
     297                 :                               pdf_text_t *text)
     298              97 : {
     299              97 :   pdf_status_t ret_code = PDF_ETEXTENC;
     300                 :   pdf_status_t ret_code_new;
     301              97 :   pdf_text_t element = NULL;
     302              97 :   short bom_found = 0;
     303              97 :   short lang_found = 0;
     304                 : 
     305              97 :   if(str == NULL)
     306                 :     {
     307               0 :       return PDF_EBADDATA;
     308                 :     }
     309                 : 
     310                 :   /* Allocate and initialize element */
     311              97 :   ret_code_new = pdf_text_new (&element);
     312              97 :   if (ret_code_new != PDF_OK)
     313                 :     {
     314                 :       /* Oops, element creation failed due to some error... */
     315               0 :       return ret_code_new;
     316                 :     }
     317                 : 
     318                 :   /* First of all, check first two bytes to detect UTF-16BE BOM or lang/country
     319                 :    *  code initializer.
     320                 :    *  If length of the text is less than 2, then we can assume it is encoded in
     321                 :    *  PDF Doc Encoding */
     322              97 :   if(size >= 2)
     323                 :     {
     324                 :       /* Check Unicode Byte Order Marker encoded in UTF-16BE */
     325              92 :       if(pdf_text_check_unicode_bom(str, size, PDF_TEXT_UTF16_BE, 0))
     326                 :         {
     327              21 :           bom_found = 1;
     328                 :           /* Check Lang/Country Code initializer */
     329              21 :           if((size >= 4) && \
     330                 :              (str[3] == PDF_TEXT_LCI_1) && \
     331                 :              (str[2] == PDF_TEXT_LCI_0))
     332                 :             {
     333              16 :               lang_found = 1;
     334                 :             }
     335                 :         }
     336                 :       /* Check Lang/Country Code initializer (if this is the nth call to the
     337                 :        *  function parsing a single UTF-16BE string.*/
     338              71 :       else if((str[1] == PDF_TEXT_LCI_1) && \
     339                 :               (str[0] == PDF_TEXT_LCI_0))
     340                 :         {
     341              12 :           lang_found = 1;
     342                 :         }
     343                 :     }
     344                 : 
     345                 :   /* If either BOM or Lang Marker are found, process PDF string as encoded
     346                 :    *  in UTF16-BE */
     347              97 :   if(bom_found || lang_found)
     348                 :     {
     349              33 :       pdf_char_t *string_start = (pdf_char_t *)str;
     350              33 :       pdf_size_t string_length = size;
     351                 : 
     352                 :       /* Skip 2-bytes BOM */
     353              33 :       if(bom_found)
     354                 :         {
     355              21 :           string_start += 2;
     356              21 :           string_length -= 2;
     357                 :         }
     358                 : 
     359                 :       /* If lang/country code available, obtain and store the information */
     360              33 :       if((lang_found) && \
     361                 :          (pdf_text_get_lang_from_utf16be(element,
     362                 :                                          &string_start, &string_length,
     363                 :                                          string_start, string_length)!=PDF_OK))
     364                 :         {
     365                 :           PDF_DEBUG_BASE("Invalid Lang/Code info detected");
     366               0 :           pdf_text_destroy(element);
     367               0 :           return PDF_ETEXTENC;
     368                 :         }
     369                 : 
     370                 :       /* And finally convert to UTF-32... */
     371              33 :       ret_code = pdf_text_utf16be_to_utf32he(string_start,
     372                 :                                              string_length,
     373                 :                                              &(element->data),
     374                 :                                              &(element->size),
     375                 :                                              remaining_str,
     376                 :                                              remaining_length);
     377                 :     }
     378                 :   /* Else, process PDF string as encoded in PDF Doc Encoding */
     379                 :   else
     380                 :     {
     381                 :       /* We already know that this string will be fully stored, without
     382                 :        *  splitting in chunks */
     383              64 :       if(remaining_length != NULL)
     384                 :         {
     385              24 :           *remaining_length = 0;
     386                 :         }
     387              64 :       if(remaining_str != NULL)
     388                 :         {
     389              24 :           *remaining_str = NULL;
     390                 :         }
     391                 :       /* And perform the conversion */
     392              64 :       ret_code = pdf_text_pdfdocenc_to_utf32he(str,
     393                 :                                                size,
     394                 :                                                &(element->data),
     395                 :                                                &(element->size));
     396                 :     }
     397                 : 
     398                 :   /* Only store in the output element if and only if everything went ok */
     399              97 :   if(ret_code == PDF_OK)
     400                 :     {
     401              86 :       *text = element;
     402                 :     }
     403                 :   else
     404                 :     {
     405              11 :       pdf_text_destroy(element);
     406                 :     }
     407              97 :   return ret_code;
     408                 : }
     409                 : 
     410                 : 
     411                 : pdf_status_t
     412                 : pdf_text_new_from_unicode (const pdf_char_t *str,
     413                 :                            const pdf_size_t size,
     414                 :                            const enum pdf_text_unicode_encoding_e enc,
     415                 :                            pdf_text_t *text)
     416             539 : {
     417             539 :   pdf_text_t element = NULL;
     418             539 :   pdf_status_t ret_code = PDF_OK;
     419                 :   pdf_status_t ret_code_new;
     420                 : 
     421             539 :   if(str == NULL)
     422                 :     {
     423               0 :       return PDF_EBADDATA;
     424                 :     }
     425                 : 
     426                 :   /* Allocate and initialize element */
     427             539 :   ret_code_new = pdf_text_new (&element);
     428             539 :   if (ret_code_new != PDF_OK)
     429                 :     {
     430                 :       /* Oops, element creation failed due to some error... */
     431               0 :       return ret_code_new;
     432                 :     }
     433                 : 
     434                 :   /* Set Unicode contents */
     435             539 :   if(size > 0)
     436                 :     {
     437             530 :       ret_code = pdf_text_set_unicode(element, str, size, enc);
     438                 :     }
     439                 : 
     440             539 :   if(ret_code == PDF_OK)
     441                 :     {
     442                 :       /* Perfect! Set output variable */
     443             499 :       *text = element;
     444                 :     }
     445                 :   else
     446                 :     {
     447                 :       /* Conversion went wrong... so destroy object contents */
     448              40 :       pdf_text_destroy(element);
     449                 :     }
     450                 : 
     451                 :   /* Return status of the conversion */
     452             539 :   return ret_code;
     453                 : }
     454                 : 
     455                 : 
     456                 : pdf_status_t
     457                 : pdf_text_new_from_u32 (const pdf_u32_t number,
     458                 :                        pdf_text_t *text)
     459               2 : {
     460                 :   /* Longest number to hold in 32bit: 2^32 = 4294967296 (10 chars) */
     461                 :   pdf_char_t temp[10 + 1];
     462                 :   pdf_size_t n;
     463                 : 
     464                 :   /* Print number in temporal char array, and get number of output chars */
     465               2 :   n = sprintf((char *)&temp[0],"%u",(unsigned int)number);
     466                 : 
     467                 :   /* At least one char should have been printed! */
     468               2 :   if(n > 0)
     469                 :     {
     470                 :       /* Treat the generated string as UTF-8 encoded (just numbers in ASCII) */
     471               2 :       return pdf_text_new_from_unicode (&temp[0], n, PDF_TEXT_UTF8, text);
     472                 :     }
     473                 :   else
     474                 :     {
     475                 :       PDF_DEBUG_BASE("Invalid u32 received: %u", (unsigned int)number);
     476               0 :       return PDF_EBADTEXT;
     477                 :     }
     478                 : }
     479                 : 
     480                 : 
     481                 : /* Return the country associated with a text variable */
     482                 : const pdf_char_t *
     483                 : pdf_text_get_country (const pdf_text_t text)
     484             154 : {
     485             262 :   return (const pdf_char_t *)text->country;
     486                 : }
     487                 : 
     488                 : /* Return the language associated with a text variable */
     489                 : const pdf_char_t *
     490                 : pdf_text_get_language (const pdf_text_t text)
     491             222 : {
     492             348 :   return (const pdf_char_t *)text->lang;
     493                 : }
     494                 : 
     495                 : /* Associate a text variable (full text) with a country code */
     496                 : pdf_status_t
     497                 : pdf_text_set_country (pdf_text_t text,
     498                 :                       const pdf_char_t *code)
     499             154 : {
     500             154 :   if((code == NULL) || \
     501                 :      (strlen((char *)code) != (PDF_TEXT_CCL-1)))
     502                 :     {
     503               2 :       return PDF_EBADDATA;
     504                 :     }
     505                 : 
     506             152 :   memcpy(&(text->country[0]), code, PDF_TEXT_CCL-1);
     507                 :   /* Make sure that last byte is NUL */
     508             152 :   text->country[PDF_TEXT_CCL-1] = '\0';
     509             152 :   return PDF_OK;
     510                 : }
     511                 : 
     512                 : 
     513                 : /* Associate a text variable (full text) with a language code */
     514                 : pdf_status_t
     515                 : pdf_text_set_language (pdf_text_t text,
     516                 :                        const pdf_char_t *code)
     517             301 : {
     518             301 :   if((code == NULL) || \
     519                 :      (strlen((char *)code) != (PDF_TEXT_CCL-1)))
     520                 :     {
     521               2 :       return PDF_EBADDATA;
     522                 :     }
     523                 : 
     524             299 :   memcpy(&(text->lang[0]), code, PDF_TEXT_CCL-1);
     525                 :   /* Make sure that last byte is NUL */
     526             299 :   text->lang[PDF_TEXT_CCL-1] = '\0';
     527             299 :   return PDF_OK;
     528                 : }
     529                 : 
     530                 : 
     531                 : /* Determine if a given text variable is empty (contains no text) */
     532                 : inline pdf_bool_t
     533                 : pdf_text_empty_p (const pdf_text_t text)
     534               5 : {
     535              45 :   return ((text->size != 0) ? PDF_FALSE : PDF_TRUE);
     536                 : }
     537                 : 
     538                 : 
     539                 : /* Get default system host encoding */
     540                 : pdf_text_host_encoding_t
     541                 : pdf_text_get_host_encoding(void)
     542              28 : {
     543              28 :   return pdf_text_context_get_host_encoding();
     544                 : }
     545                 : 
     546                 : 
     547                 : /* Check if host encoding is available */
     548                 : pdf_status_t
     549                 : pdf_text_check_host_encoding(const pdf_char_t *encoding_name,
     550                 :                              pdf_text_host_encoding_t *p_encoding)
     551               4 : {
     552                 :   /* Check length of host encoding */
     553               4 :   if(strlen((char *)encoding_name) >= PDF_TEXT_HENMAXL)
     554                 :     {
     555                 :       PDF_DEBUG_BASE("Encoding name too long!");
     556               0 :       return PDF_EBADDATA;
     557                 :     }
     558                 : 
     559               4 :   if(pdf_text_host_encoding_is_available(encoding_name) == PDF_OK)
     560                 :     {
     561               3 :       strcpy((char *)(&(p_encoding->name[0])), (char *)encoding_name);
     562               3 :       p_encoding->name[strlen((char *)encoding_name)-1] = '\0';
     563               3 :       return PDF_OK;
     564                 :     }
     565                 :   else
     566                 :     {
     567               1 :       return PDF_ETEXTENC;
     568                 :     }
     569                 : }
     570                 : 
     571                 : 
     572                 : pdf_text_host_encoding_t
     573                 : pdf_text_get_best_encoding (pdf_text_t text,
     574                 :                             const pdf_text_host_encoding_t preferred_encoding)
     575               1 : {
     576                 :   pdf_text_host_encoding_t ret_encoding;
     577                 : #ifdef PDF_HOST_WIN32
     578                 :   static const pdf_char_t *to_check [3] = {
     579                 :     (pdf_char_t *) "CP65001", /* UTF-8 */
     580                 :     (pdf_char_t *) "CP1200",  /* UTF-16LE */
     581                 :     (pdf_char_t *) "CP12000"   /* UTF-32LE */
     582                 :   };
     583                 : #else
     584                 :   static const pdf_char_t *to_check [3] = {
     585                 :     (pdf_char_t *) "UTF-8",
     586                 :     (pdf_char_t *) "UTF-16",
     587                 :     (pdf_char_t *) "UTF-32"
     588                 :   };
     589                 : 
     590                 : #endif
     591               1 :   int i = 0;
     592                 :   /* Check for Unicode support as host encoding */
     593               1 :   for(i = 0; i<3; i++)
     594                 :     {
     595               1 :       if(pdf_text_check_host_encoding(to_check[i], &ret_encoding) == PDF_OK)
     596                 :         {
     597               1 :           return ret_encoding;
     598                 :         }
     599                 :     }
     600                 :   /* If host does not support any Unicode encoding conversion, return the
     601                 :    *  preferred one directly */
     602               0 :   return preferred_encoding;
     603                 : }
     604                 : 
     605                 : 
     606                 : 
     607                 : pdf_status_t
     608                 : pdf_text_get_host (pdf_char_t **contents,
     609                 :                    pdf_size_t *length,
     610                 :                    const pdf_text_t text,
     611                 :                    const pdf_text_host_encoding_t enc)
     612              33 : {
     613                 : 
     614              33 :   return pdf_text_utf32he_to_host (text->data, text->size, enc,
     615                 :                                    contents, length);
     616                 : }
     617                 : 
     618                 : 
     619                 : /* Get the contents of a text variable encoded in PDFDocEncoding, as a NUL
     620                 :  *  terminated string */
     621                 : pdf_status_t
     622                 : pdf_text_get_pdfdocenc (pdf_char_t **contents,
     623                 :                         const pdf_text_t text)
     624             119 : {
     625                 :   pdf_status_t ret_code;
     626             119 :   pdf_char_t *data = NULL;
     627             119 :   pdf_size_t size = -1;
     628                 : 
     629             119 :   ret_code = pdf_text_utf32he_to_pdfdocenc(text->data, text->size,
     630                 :                                            &data, &size);
     631                 : 
     632                 :   /* Now, if conversion went ok... */
     633             119 :   if(ret_code == PDF_OK)
     634                 :     {
     635                 :       /* Add NUL character at the end of the array */
     636             119 :       data = pdf_realloc(data, size+1);
     637             119 :       if(data != NULL)
     638                 :         {
     639             119 :           data[size] = '\0';
     640                 :           /* Set output data... */
     641             119 :           *contents = data;
     642                 :         }
     643                 :       else
     644                 :         {
     645               0 :           return PDF_ENOMEM;
     646                 :         }
     647                 :     }
     648                 :   /* else, clear allocated memory, if any */
     649               0 :   else if(data != NULL)
     650                 :     {
     651               0 :       pdf_dealloc(data);
     652                 :     }
     653                 : 
     654             119 :   return ret_code;
     655                 : }
     656                 : 
     657                 : 
     658                 : pdf_status_t
     659                 : pdf_text_get_unicode (pdf_char_t **contents,
     660                 :                       pdf_size_t *length,
     661                 :                       const pdf_text_t text,
     662                 :                       const enum pdf_text_unicode_encoding_e enc,
     663                 :                       const pdf_u32_t options)
     664             592 : {
     665                 :   pdf_status_t ret_code;
     666                 :   enum pdf_text_unicode_encoding_e new_enc;
     667             592 :   pdf_char_t *out_data = NULL;
     668             592 :   pdf_size_t out_length = 0;
     669                 : 
     670                 :   /* Check for invalid options... */
     671             592 :   if((options & PDF_TEXT_UTF16BE_WITH_LANGCODE) && \
     672                 :      (enc != PDF_TEXT_UTF16_BE))
     673                 :     {
     674                 :       PDF_DEBUG_BASE("Lang/Country info only available for UTF-16BE");
     675                 :       /* Not allowed!!! */
     676             164 :       return PDF_EBADDATA;
     677                 :     }
     678                 : 
     679                 :   /* If host endianness required, check it and convert input encoding */
     680             428 :   new_enc = pdf_text_transform_he_to_unicode_encoding(enc);
     681                 : 
     682                 :   /* If text is empty, set empty string */
     683             460 :   if((text->data == NULL) || \
     684                 :      (text->size == 0))
     685                 :     {
     686              32 :       out_data = NULL;
     687              32 :       out_length = 0;
     688              32 :       ret_code = PDF_OK;
     689                 :     }
     690                 :   else
     691                 :     {
     692                 :       /* Perform conversion */
     693             396 :       switch(new_enc)
     694                 :       {
     695                 :         case PDF_TEXT_UTF8: /* UTF-8 */
     696              23 :           ret_code = pdf_text_utf32he_to_utf8(text->data, text->size,
     697                 :                                               &out_data, &out_length);
     698              23 :           break;
     699                 :         case PDF_TEXT_UTF16_LE: /* UTF-16LE */
     700              20 :           ret_code = pdf_text_utf32he_to_utf16le(text->data, text->size,
     701                 :                                                  &out_data, &out_length);
     702              20 :           break;
     703                 :         case PDF_TEXT_UTF16_BE: /* UTF-16BE */
     704              60 :           ret_code = pdf_text_utf32he_to_utf16be(text->data, text->size,
     705                 :                                                  &out_data, &out_length);
     706              60 :           break;
     707                 :         case PDF_TEXT_UTF32_LE: /* UTF-32LE */
     708             217 :           ret_code = pdf_text_utf32he_to_utf32le(text->data, text->size,
     709                 :                                                  &out_data, &out_length);
     710             217 :           break;
     711                 :         case PDF_TEXT_UTF32_BE: /* UTF-32BE */
     712              76 :           ret_code = pdf_text_utf32he_to_utf32be(text->data, text->size,
     713                 :                                                  &out_data, &out_length);
     714              76 :           break;
     715                 :         default:
     716               0 :           ret_code = PDF_ETEXTENC;
     717                 :       }
     718                 :     }
     719                 : 
     720                 :   /* Check if specific options were requested */
     721             428 :   if(options != PDF_TEXT_UNICODE_NO_OPTION)
     722                 :     {
     723                 :       pdf_char_t header[PDF_TEXT_USHMAXL];
     724             138 :       pdf_size_t header_size = 0;
     725             138 :       pdf_size_t trailer_size = 0;
     726                 : 
     727                 :       /* Compute header if needed */
     728             138 :       if((options &  PDF_TEXT_UNICODE_WITH_BOM) || \
     729                 :          (options &  PDF_TEXT_UTF16BE_WITH_LANGCODE))
     730                 :         {
     731                 :           /* Clear header array */
     732             108 :           memset(&(header[0]), 0, PDF_TEXT_USHMAXL);
     733                 :           /* Get requested header (BOM and/or lang/country info) */
     734             108 :           pdf_text_get_unicode_string_header(header,
     735                 :                                              &header_size,
     736                 :                                              new_enc,
     737                 :                                              options,
     738                 :                                              pdf_text_get_language(text),
     739                 :                                              pdf_text_get_country(text));
     740                 :         }
     741                 :       /* Compute trailer if needed */
     742             138 :       if(options & PDF_TEXT_UNICODE_WITH_NUL_SUFFIX)
     743                 :         {
     744              84 :           switch(new_enc)
     745                 :             {
     746                 :               case PDF_TEXT_UTF8:
     747              12 :                   trailer_size = 1;
     748              12 :                   break;
     749                 :               case PDF_TEXT_UTF16_BE:
     750                 :               case PDF_TEXT_UTF16_LE:
     751                 :               case PDF_TEXT_UTF16_HE:
     752              48 :                   trailer_size = 2;
     753              48 :                   break;
     754                 :               case PDF_TEXT_UTF32_BE:
     755                 :               case PDF_TEXT_UTF32_LE:
     756                 :               case PDF_TEXT_UTF32_HE:
     757              24 :                   trailer_size = 4;
     758              24 :                   break;
     759                 :               default:
     760               0 :                   trailer_size = 0;
     761                 :                   break;
     762                 :             }
     763                 :         }
     764                 : 
     765             138 :       if((header_size > 0) || \
     766                 :          (trailer_size > 0))
     767                 :         {
     768             138 :           pdf_char_t *new_out_data = NULL;
     769                 : 
     770                 :           /* Allocate memory for new string */
     771             138 :           new_out_data = (pdf_char_t *)pdf_alloc(out_length + \
     772                 :                                                  header_size + \
     773                 :                                                  trailer_size);
     774             138 :           if(new_out_data == NULL)
     775                 :             {
     776               0 :               return PDF_ENOMEM;
     777                 :             }
     778                 :           /* Store header */
     779             138 :           memcpy(new_out_data, &header[0], header_size);
     780                 : 
     781             138 :           if((out_data != NULL) && \
     782                 :              (out_length != 0))
     783                 :             {
     784                 :               /* Store unicode data, if any */
     785             115 :               memcpy(&new_out_data[header_size], out_data, out_length);
     786                 :               /* Reset output data array, if any */
     787             115 :               pdf_dealloc(out_data);
     788                 :             }
     789                 : 
     790                 :           /* Store trailer (N-byte NUL) */
     791             138 :           if(trailer_size > 0)
     792                 :             {
     793              84 :               memset(&new_out_data[out_length+header_size],0,trailer_size);
     794                 :             }
     795                 : 
     796             138 :           out_data = new_out_data;
     797             138 :           out_length += (header_size + trailer_size);
     798                 :         }
     799                 :       else
     800                 :         {
     801                 :           PDF_DEBUG_BASE("Invalid unicode option requested (%u)",
     802                 :                          (unsigned int)options);
     803                 :         }
     804                 :     }
     805                 : 
     806                 :   /* Only store in the output element if and only if everything went ok */
     807             428 :   if(ret_code == PDF_OK)
     808                 :     {
     809             428 :       *contents = out_data;
     810             428 :       *length = out_length;
     811                 :     }
     812               0 :   else if(out_data != NULL)
     813                 :     {
     814               0 :       pdf_dealloc(out_data);
     815                 :     }
     816             428 :   return ret_code;
     817                 : }
     818                 : 
     819                 : 
     820                 : pdf_char_t *
     821                 : pdf_text_get_hex (const pdf_text_t text,
     822                 :                   const pdf_char_t delimiter)
     823               2 : {
     824                 :   int i;
     825                 :   int j;
     826                 :   unsigned int new_str_length;
     827                 :   pdf_char_t *new_str;
     828                 :   char new_hex_char [3];
     829                 : 
     830               2 :   if(text->size > 0)
     831                 :     {
     832                 :       /* Get new string length. If input string has N bytes, we need:
     833                 :        * - 1 byte for last NUL char
     834                 :        * - 2N bytes for hexadecimal char representation of each byte...
     835                 :        * - N-1 bytes for the separator ':'
     836                 :        * So... a total of (1+2N+N-1) = 3N bytes are needed... */
     837               1 :       new_str_length =  3 * text->size;
     838                 : 
     839                 :       /* Allocate memory for new array and initialize contents to NUL */
     840               1 :       new_str = (pdf_char_t *)pdf_alloc(new_str_length);
     841               1 :       if(new_str != NULL)
     842                 :         {
     843               1 :           memset(new_str, 0, new_str_length);
     844                 : 
     845                 :           /* Print hexadecimal representation of each byte... */
     846               9 :           for(i=0, j=0; i<text->size; i++, j+=3)
     847                 :             {
     848                 :               /* Clear helper array... */
     849               8 :               memset(&new_hex_char[0], 0, 3);
     850                 :               /* Print character in helper array... */
     851               8 :               sprintf( new_hex_char, "%02X", (unsigned int)text->data[i]);
     852                 :               /* Copy to output string... */
     853               8 :               memcpy(&new_str[j],&new_hex_char[0],2);
     854                 :               /* And if needed, add separator */
     855               8 :               if(i != (text->size-1) )
     856                 :                 {
     857               7 :                   new_str[j+2] = delimiter;
     858                 :                 }
     859                 :             }
     860                 :         }
     861                 :     }
     862                 :   else
     863                 :     {
     864               1 :       new_str = (pdf_char_t *)pdf_alloc(1);
     865               1 :       if(new_str != NULL)
     866                 :         {
     867               1 :           new_str[0] = '\0';
     868                 :         }
     869                 :     }
     870                 :   /* Set output string */
     871               2 :   return new_str;
     872                 : }
     873                 : 
     874                 : 
     875                 : pdf_status_t
     876                 : pdf_text_set_host (pdf_text_t text,
     877                 :                    const pdf_char_t *str,
     878                 :                    const pdf_size_t size,
     879                 :                    const pdf_text_host_encoding_t enc)
     880              10 : {
     881                 :   pdf_status_t ret_code;
     882                 :   pdf_char_t *temp_data;
     883                 :   pdf_size_t temp_size;
     884                 : 
     885              10 :   if(str == NULL)
     886                 :     {
     887               0 :       return PDF_EBADDATA;
     888                 :     }
     889                 : 
     890              10 :   ret_code = pdf_text_host_to_utf32he (str, size, enc,
     891                 :                                        &temp_data, &temp_size);
     892              10 :   if(ret_code == PDF_OK)
     893                 :     {
     894                 :       /* Destroy previous contents of text variable, if any */
     895               6 :       pdf_text_clean_contents(text);
     896                 : 
     897                 :       /* Really set contents */
     898               6 :       text->data = temp_data;
     899               6 :       text->size = temp_size;
     900                 :     }
     901              10 :   return ret_code;
     902                 : }
     903                 : 
     904                 : 
     905                 : /* Set PDF Doc Endoded string */
     906                 : pdf_status_t
     907                 : pdf_text_set_pdfdocenc (pdf_text_t text,
     908                 :                         const pdf_char_t *str)
     909             113 : {
     910                 :   pdf_status_t ret_code;
     911                 :   pdf_char_t *temp_data;
     912                 :   pdf_size_t temp_size;
     913                 : 
     914             113 :   if(str == NULL)
     915                 :     {
     916               0 :       return PDF_EBADDATA;
     917                 :     }
     918                 : 
     919             113 :   ret_code = pdf_text_pdfdocenc_to_utf32he (str, strlen((char *)str),
     920                 :                                             &temp_data, &temp_size);
     921             113 :   if(ret_code == PDF_OK)
     922                 :     {
     923                 :       /* Destroy previous contents of text variable, if any */
     924             112 :       pdf_text_clean_contents(text);
     925                 : 
     926                 :       /* Really set contents */
     927             112 :       text->data = temp_data;
     928             112 :       text->size = temp_size;
     929                 :     }
     930             113 :   return ret_code;
     931                 : }
     932                 : 
     933                 : 
     934                 : pdf_status_t
     935                 : pdf_text_set_unicode (pdf_text_t text,
     936                 :                       const pdf_char_t *str,
     937                 :                       const pdf_size_t size,
     938                 :                       const enum pdf_text_unicode_encoding_e enc)
     939             693 : {
     940             693 :   pdf_status_t ret_code = PDF_ETEXTENC;
     941                 :   pdf_char_t *temp_data;
     942                 :   pdf_size_t temp_size;
     943                 :   enum pdf_text_unicode_encoding_e new_enc;
     944                 : 
     945             693 :   if((str == NULL) || \
     946                 :      (size == 0))
     947                 :     {
     948               0 :       return PDF_EBADDATA;
     949                 :     }
     950                 : 
     951                 :   /* If host endianness required, check it and convert input encoding */
     952             693 :   new_enc = pdf_text_transform_he_to_unicode_encoding(enc);
     953                 : 
     954             693 :   switch(new_enc)
     955                 :   {
     956                 :     case PDF_TEXT_UTF8: /* UTF-8 */
     957             127 :       ret_code = pdf_text_utf8_to_utf32he(str, size,
     958                 :                                           &temp_data, &temp_size);
     959             127 :       break;
     960                 :     case PDF_TEXT_UTF16_LE: /* UTF-16LE */
     961              70 :       ret_code = pdf_text_utf16le_to_utf32he(str, size,
     962                 :                                              &temp_data, &temp_size);
     963              70 :       break;
     964                 :     case PDF_TEXT_UTF16_BE: /* UTF-16BE */
     965              45 :       ret_code = pdf_text_utf16be_to_utf32he(str, size,
     966                 :                                              &temp_data, &temp_size,
     967                 :                                              NULL, NULL);
     968              45 :       break;
     969                 :     case PDF_TEXT_UTF32_LE: /* UTF-32LE */
     970              70 :       ret_code = pdf_text_utf32le_to_utf32he(str, size,
     971                 :                                              &temp_data, &temp_size);
     972              70 :       break;
     973                 :     case PDF_TEXT_UTF32_BE: /* UTF-32BE */
     974             381 :       ret_code = pdf_text_utf32be_to_utf32he(str, size,
     975                 :                                              &temp_data, &temp_size);
     976             381 :       break;
     977                 :     default:
     978               0 :       ret_code = PDF_EBADDATA;
     979                 :   }
     980                 : 
     981             693 :   if(ret_code == PDF_OK)
     982                 :     {
     983                 :       /* Destroy previous contents of text variable, if any */
     984             573 :       pdf_text_clean_contents(text);
     985                 : 
     986                 :       /* Really set contents */
     987             573 :       text->data = temp_data;
     988             573 :       text->size = temp_size;
     989                 :     }
     990             693 :   return ret_code;
     991                 : }
     992                 : 
     993                 : 
     994                 : /* Concatenate the two text variables, only if country/lang info is equal */
     995                 : pdf_status_t
     996                 : pdf_text_concat (pdf_text_t text1,
     997                 :                  const pdf_text_t text2,
     998                 :                  const pdf_bool_t override_langinfo)
     999              16 : {
    1000              16 :   if(!override_langinfo)
    1001                 :     {
    1002                 :       /* An error will be returned if lang code is different */
    1003               7 :       if(strcmp((char *)text1->lang, (char *)text2->lang) != 0)
    1004                 :         {
    1005               3 :           return PDF_ETEXTENC;
    1006                 :         }
    1007                 : 
    1008                 :       /* An error will be returned if country code is different */
    1009               4 :       if(strcmp((char *)text1->country, (char *)text2->country) != 0)
    1010                 :         {
    1011               0 :           return PDF_ETEXTENC;
    1012                 :         }
    1013                 :     }
    1014                 : 
    1015                 :   /* Ok, so language/country info is equal or non-existent, start
    1016                 :    *  concatenation */
    1017              13 :   if(text2->size > 0)
    1018                 :     {
    1019                 :       pdf_char_t * tmp;
    1020                 :       /* Re-allocate memory in first text element */
    1021               9 :       tmp = (pdf_char_t *)pdf_realloc (text1->data,
    1022                 :                                        text1->size + text2->size);
    1023                 : 
    1024               9 :       if (tmp == NULL)
    1025                 :         {
    1026               0 :           return PDF_ENOMEM;
    1027                 :         }
    1028                 : 
    1029               9 :       text1->data = tmp;
    1030                 : 
    1031                 :       /* Copy contents of second element after the first one */
    1032               9 :       memcpy(&(text1->data[text1->size]), text2->data, text2->size);
    1033                 : 
    1034                 :       /* Update size of first element */
    1035               9 :       text1->size += text2->size;
    1036                 : 
    1037               9 :       text1->modified = PDF_TRUE;
    1038                 :     }
    1039                 : 
    1040              13 :   return PDF_OK;
    1041                 : }
    1042                 : 
    1043                 : 
    1044                 : /* Concatenate a text variable with an ascii string */
    1045                 : pdf_status_t
    1046                 : pdf_text_concat_ascii (pdf_text_t text1,
    1047                 :                        const pdf_char_t * ascii_str)
    1048               4 : {
    1049                 :   pdf_size_t len;
    1050                 : 
    1051               4 :   len = (pdf_size_t) strlen ((char*)ascii_str);
    1052               4 :   if (!pdf_text_is_ascii7 (ascii_str, len))
    1053                 :     {
    1054               0 :       return PDF_EBADDATA;
    1055                 :     }
    1056                 : 
    1057                 :   /* now convert to utf32he and concatenate */
    1058               4 :   if(len > 0)
    1059                 :     {
    1060                 :       pdf_char_t * newbuf;
    1061                 :       pdf_status_t ret;
    1062                 :       pdf_char_t *tmp_data;
    1063                 :       pdf_size_t tmp_size;
    1064                 : 
    1065                 :       /* ascii string is valid utf8 */
    1066               2 :       ret = pdf_text_utf8_to_utf32he (ascii_str, len, &tmp_data, &tmp_size);
    1067               2 :       if (ret != PDF_OK)
    1068                 :         {
    1069               0 :           return ret;
    1070                 :         }     
    1071                 : 
    1072               2 :       newbuf = (pdf_char_t *)pdf_realloc (text1->data, text1->size + tmp_size);
    1073               2 :       if (newbuf == NULL)
    1074                 :         {
    1075               0 :           return PDF_ENOMEM;
    1076                 :         }
    1077                 :       else
    1078                 :         {
    1079               2 :           text1->data = newbuf;
    1080                 :         }
    1081                 : 
    1082               2 :       memcpy (&(text1->data[text1->size]), tmp_data, tmp_size);
    1083               2 :       text1->size += tmp_size;
    1084               2 :       pdf_dealloc (tmp_data);
    1085                 :     }
    1086                 : 
    1087               4 :   return PDF_OK;
    1088                 : }
    1089                 : 
    1090                 : 
    1091                 : /* Default initial size of the list of replacements */
    1092                 : #define PDF_TEXT_ISLR   32
    1093                 : 
    1094                 : /* Replace a given pattern in a text object */
    1095                 : 
    1096                 : pdf_status_t
    1097                 : pdf_text_replace (pdf_text_t text,
    1098                 :                   const pdf_text_t new_pattern,
    1099                 :                   const pdf_text_t old_pattern)
    1100              24 : {
    1101              24 :   return pdf_text_replace_multiple(text, new_pattern, &old_pattern, 1);
    1102                 : }
    1103                 : 
/* Element type of the list of replacement pointers built by
 * pdf_text_get_replacement_pointers(). */
typedef struct pdf_text_repl_s {
  /* NOTE(review): presumably the position inside the text data where an old
   * pattern was found - confirm against the code filling the list */
  pdf_char_t *data_ptr;
  /* NOTE(review): presumably the index, in the array of old patterns, of the
   * pattern that matched - confirm */
  int old_pattern_i;
} pdf_text_repl_t;
    1108                 : 
    1109                 : 
    1110                 : /* Check replacement patterns and get minimum size */
    1111                 : static pdf_status_t
    1112                 : pdf_text_check_replacement_patterns(const pdf_text_t *p_old_patterns,
    1113                 :                                     const int n_old_patterns,
    1114                 :                                     pdf_size_t *p_min_old_pattern_size)
    1115                 : {
    1116              28 :   pdf_size_t minimum_old_pattern_size = -1;
    1117                 :   int i_pattern;
    1118                 : 
    1119              66 :   for(i_pattern = 0; i_pattern < n_old_patterns; ++i_pattern)
    1120                 :     {
    1121                 :       /* Get minimum old pattern size */
    1122              40 :       if((i_pattern == 0) || \
    1123                 :          ((p_old_patterns[i_pattern])->size < minimum_old_pattern_size))
    1124                 :         {
    1125              32 :           minimum_old_pattern_size = (p_old_patterns[i_pattern])->size;
    1126                 :         }
    1127                 :       /* Empty old pattern is not allowed */
    1128              80 :       if(pdf_text_empty_p(p_old_patterns[i_pattern]))
    1129                 :         {
    1130               2 :           return PDF_ETEXTENC;
    1131                 :         }
    1132                 :     }
    1133                 : 
    1134                 :   /* Set output var and exit correctly */
    1135              26 :   *p_min_old_pattern_size = minimum_old_pattern_size;
    1136              26 :   return PDF_OK;
    1137                 : }
    1138                 : 
    1139                 : pdf_status_t
    1140                 : pdf_text_get_replacement_pointers(pdf_text_repl_t **p_rep_ptrs, \
    1141                 :                                   long *p_n_replacements, \
    1142                 :                                   pdf_size_t *p_new_size, \
    1143                 :                                   const pdf_text_t text, \
    1144                 :                                   const pdf_size_t minimum_old_pattern_size, \
    1145                 :                                   const pdf_text_t new_pattern, \
    1146                 :                                   const pdf_text_t *p_old_patterns, \
    1147                 :                                   const int n_old_patterns)
    1148              20 : {
    1149                 :   pdf_size_t new_size;
    1150                 :   int i_pattern;
    1151                 :   long i;
    1152                 :   long n_replacements;
    1153              20 :   pdf_text_repl_t *rep_ptrs = NULL;
    1154              20 :   long rep_ptrs_size = PDF_TEXT_ISLR/2;
    1155                 : 
    1156              20 :   n_replacements = 0;
    1157              20 :   i = 0;
    1158              20 :   new_size = 0;
    1159             596 :   while(i <= (text->size - minimum_old_pattern_size))
    1160                 :     {
    1161                 :       /* If old pattern found... */
    1162             556 :       int old_pattern_found = 0;
    1163             556 :       i_pattern = 0;
    1164            2040 :       while((!old_pattern_found) && \
    1165                 :             (i_pattern < n_old_patterns))
    1166                 :         {
    1167             962 :           if(((text->size - i) >= ((p_old_patterns[i_pattern])->size)) && \
    1168                 :              (memcmp(&(text->data[i]), \
    1169                 :                      (p_old_patterns[i_pattern])->data,
    1170                 :                      (p_old_patterns[i_pattern])->size)==0))
    1171                 :             {
    1172              34 :               old_pattern_found = 1;
    1173                 :               /* Duplicate size of replacement pointers list, if needed */
    1174              34 :               if((rep_ptrs == NULL) || \
    1175                 :                  (rep_ptrs_size == n_replacements))
    1176                 :                 {
    1177              12 :                   rep_ptrs = (pdf_text_repl_t *)pdf_realloc(rep_ptrs,
    1178                 :                                                             2 * rep_ptrs_size * \
    1179                 :                                                             sizeof(pdf_text_repl_t));
    1180              12 :                   if(rep_ptrs == NULL)
    1181                 :                     {
    1182               0 :                       return PDF_ENOMEM;
    1183                 :                     }
    1184                 :                 }
    1185                 :               /* Store pointer to old pattern */
    1186              34 :               rep_ptrs[n_replacements].data_ptr = &(text->data[i]);
    1187              34 :               rep_ptrs[n_replacements].old_pattern_i = i_pattern;
    1188              34 :               n_replacements++;
    1189                 :               /* The index must be updated to skip the replacement */
    1190              34 :               i += (p_old_patterns[i_pattern])->size;
    1191                 :               /* Update new size */
    1192              34 :               new_size += new_pattern->size;
    1193                 :             }
    1194                 :           else
    1195                 :             {
    1196             894 :               i_pattern++;
    1197                 :             }
    1198                 :         }
    1199             556 :       if(!old_pattern_found)
    1200                 :         {
    1201             522 :           i+=4;
    1202             522 :           new_size +=4;
    1203                 :         }
    1204                 :     }
    1205                 : 
    1206                 :   /* Udpate new size with remaining data in old array */
    1207              20 :   new_size += (text->size - i);
    1208                 : 
    1209                 :   /* Set output data and exit correctly */
    1210              20 :   *p_new_size = new_size;
    1211              20 :   *p_rep_ptrs = rep_ptrs;
    1212              20 :   *p_n_replacements = n_replacements;
    1213                 : 
    1214              20 :   return PDF_OK;
    1215                 : }
    1216                 : 
    1217                 : static pdf_status_t
    1218                 : pdf_text_perform_replacements(pdf_text_t text, \
    1219                 :                               const pdf_size_t new_size, \
    1220                 :                               const pdf_text_t new_pattern, \
    1221                 :                               const pdf_text_t *p_old_patterns, \
    1222                 :                               const int n_old_patterns, \
    1223                 :                               const pdf_text_repl_t *rep_ptrs, \
    1224                 :                               const long n_replacements)
    1225                 : {
    1226                 :   int k;
    1227                 :   pdf_char_t *new_data;
    1228                 :   pdf_char_t *new_walker;
    1229                 :   pdf_char_t *old_walker;
    1230                 : 
    1231                 :   /* Allocate new memory chunk */
    1232              12 :   new_data = (pdf_char_t *)pdf_alloc(new_size);
    1233                 : 
    1234                 :   /* Walk the list of replacements */
    1235              12 :   new_walker = new_data;
    1236              12 :   old_walker = text->data;
    1237              46 :   for(k = 0; k < n_replacements; ++k)
    1238                 :     {
    1239                 :       pdf_size_t prev_size;
    1240                 :       /* Store the data previous to the pointer */
    1241              34 :       prev_size = (rep_ptrs[k].data_ptr - old_walker);
    1242              34 :       if(prev_size > 0)
    1243                 :         {
    1244              26 :           memcpy(new_walker, old_walker, prev_size);
    1245              26 :           new_walker += prev_size;
    1246              26 :           old_walker += prev_size;
    1247                 :         }
    1248                 :       /* Perform the replacement */
    1249              34 :       memcpy(new_walker, new_pattern->data, new_pattern->size);
    1250              34 :       new_walker += (new_pattern->size);
    1251              34 :       old_walker += (p_old_patterns[rep_ptrs[k].old_pattern_i]->size);
    1252                 :     }
    1253                 : 
    1254                 :   /* Add final data */
    1255              12 :   if(((&(text->data[text->size])) - old_walker) > 0)
    1256                 :     {
    1257               2 :       memcpy(new_walker, old_walker, \
    1258                 :              ((&(text->data[text->size])) - old_walker));
    1259                 :     }
    1260                 : 
    1261                 :   /* Set correct final size and final content */
    1262              12 :   pdf_dealloc(text->data);
    1263              12 :   text->data = new_data;
    1264              12 :   text->size = new_size;
    1265                 : 
    1266              12 :   return PDF_OK;
    1267                 : }
    1268                 : 
    1269                 : 
    1270                 : pdf_status_t
    1271                 : pdf_text_replace_multiple (pdf_text_t text,
    1272                 :                            const pdf_text_t new_pattern,
    1273                 :                            const pdf_text_t *p_old_patterns,
    1274                 :                            const int n_old_patterns)
    1275              28 : {
    1276              28 :   pdf_size_t new_size = 0;
    1277              28 :   pdf_size_t minimum_old_pattern_size = -1;
    1278                 :   long n_replacements;
    1279              28 :   pdf_text_repl_t *rep_ptrs = NULL;
    1280                 : 
    1281              28 :   if((p_old_patterns == NULL) || \
    1282                 :      (n_old_patterns == 0))
    1283                 :     {
    1284               0 :       return PDF_EBADDATA;
    1285                 :     }
    1286                 : 
    1287              28 :   if(pdf_text_check_replacement_patterns(p_old_patterns, \
    1288                 :                                          n_old_patterns, \
    1289                 :                                          &minimum_old_pattern_size) != PDF_OK)
    1290                 :     {
    1291                 :       PDF_DEBUG_BASE("At least one old pattern is not valid");
    1292                 :       /* At least one old pattern is not valid */
    1293               2 :       return PDF_ETEXTENC;
    1294                 :     }
    1295                 : 
    1296                 :   /* If input text is shorter than the smallest old pattern, there is no
    1297                 :    *  replacement to be done */
    1298              26 :   if(minimum_old_pattern_size > text->size)
    1299                 :     {
    1300               6 :       return PDF_OK;
    1301                 :     }
    1302                 : 
    1303                 :   /* First, count number of replacements to be done... a replacement pointer
    1304                 :    * will be stored for each replacement needed */
    1305              20 :   if(pdf_text_get_replacement_pointers(&rep_ptrs, \
    1306                 :                                        &n_replacements, \
    1307                 :                                        &new_size, \
    1308                 :                                        text, \
    1309                 :                                        minimum_old_pattern_size, \
    1310                 :                                        new_pattern, \
    1311                 :                                        p_old_patterns, \
    1312                 :                                        n_old_patterns) != PDF_OK)
    1313                 :     {
    1314                 :       PDF_DEBUG_BASE("Error getting replacement pointers");
    1315               0 :       return PDF_ETEXTENC;
    1316                 :     }
    1317                 : 
    1318                 :   /* Now, really perform replacements, if required */
    1319              20 :   if(n_replacements > 0)
    1320                 :     {
    1321              12 :       pdf_text_perform_replacements(text, \
    1322                 :                                     new_size, \
    1323                 :                                     new_pattern, \
    1324                 :                                     p_old_patterns, \
    1325                 :                                     n_old_patterns, \
    1326                 :                                     rep_ptrs, \
    1327                 :                                     n_replacements);
    1328              12 :       if(rep_ptrs != NULL)
    1329                 :         {
    1330                 :           /* Dealloc list of pointers to replacements */
    1331              12 :           pdf_dealloc(rep_ptrs);
    1332                 :         }
    1333                 : 
    1334              12 :       text->modified = PDF_TRUE;
    1335                 :     }
    1336                 : 
    1337              20 :   return PDF_OK;
    1338                 : }
    1339                 : 
    1340                 : 
    1341                 : /* Replace a given ASCII-7 pattern in a text object */
    1342                 : pdf_status_t
    1343                 : pdf_text_replace_ascii (pdf_text_t text,
    1344                 :                         const pdf_char_t *new_pattern,
    1345                 :                         const pdf_char_t *old_pattern)
    1346              14 : {
    1347                 :   /* Check if patterns are real ASCII-7 valid strings */
    1348              14 :   if((!pdf_text_is_ascii7(old_pattern,
    1349                 :                          (pdf_size_t)strlen((char *)old_pattern))) || \
    1350                 :      (!pdf_text_is_ascii7(new_pattern,
    1351                 :                          (pdf_size_t)strlen((char *)new_pattern))))
    1352                 :     {
    1353                 :       PDF_DEBUG_BASE("At least one of the requested patterns is not "
    1354                 :                      "7-bit ASCII");
    1355               0 :       return PDF_EBADDATA;
    1356                 :     }
    1357                 :   else
    1358                 :     {
    1359                 :       /* Ok, so load ASCII strings as if it were UTF-8 strings */
    1360                 :       pdf_text_t new_pattern_text;
    1361                 :       pdf_text_t old_pattern_text;
    1362                 :       pdf_status_t ret_code;
    1363                 : 
    1364                 :       /* Create intermediate pdf_text_t variables */
    1365              14 :       if(pdf_text_new_from_unicode(new_pattern,
    1366                 :                                    (pdf_size_t) strlen ((char *) new_pattern),
    1367                 :                                    PDF_TEXT_UTF8,
    1368                 :                                    &new_pattern_text) != PDF_OK)
    1369                 :         {
    1370                 :           PDF_DEBUG_BASE("Error creating pdf_text_t from ASCII new pattern");
    1371               0 :           return PDF_EBADTEXT;
    1372                 :         }
    1373              14 :       if(pdf_text_new_from_unicode(old_pattern,
    1374                 :                                    (pdf_size_t) strlen ((char *)old_pattern),
    1375                 :                                    PDF_TEXT_UTF8,
    1376                 :                                    &old_pattern_text) != PDF_OK)
    1377                 :         {
    1378                 :           PDF_DEBUG_BASE("Error creating pdf_text_t from ASCII old pattern");
    1379               0 :           return PDF_EBADTEXT;
    1380                 :         }
    1381                 : 
    1382                 :       /* Perform replacement */
    1383              14 :       ret_code = pdf_text_replace(text, new_pattern_text, old_pattern_text);
    1384                 : 
    1385                 :       /* Destroy used intermediate variables */
    1386              14 :       pdf_text_destroy(new_pattern_text);
    1387              14 :       pdf_text_destroy(old_pattern_text);
    1388                 : 
    1389              14 :       return ret_code;
    1390                 :     }
    1391                 : }
    1392                 : 
    1393                 : 
    1394                 : pdf_status_t
    1395                 : pdf_text_filter (pdf_text_t text,
    1396                 :                  const pdf_u32_t filter)
    1397              63 : {
    1398                 :   /* More than one filter at the same time can be requested! But Caution!
    1399                 :    *  UpperCase filter, LowerCase filter and TitleCase filter are mutually
    1400                 :    *  exclusive (at most only one of them must be enabled) */
    1401                 : 
    1402              63 :   if((((filter & PDF_TEXT_FILTER_UPPER_CASE) ? 1 : 0) + \
    1403                 :       ((filter & PDF_TEXT_FILTER_LOWER_CASE) ? 1 : 0) + \
    1404                 :       ((filter & PDF_TEXT_FILTER_TITLE_CASE) ? 1 : 0)) > 1)
    1405                 :     {
    1406                 :       PDF_DEBUG_BASE("At most only one case conversion filter can be applied");
    1407               0 :       return PDF_EBADDATA;
    1408                 :     }
    1409                 : 
    1410                 :   /* 0x00000001 */
    1411              63 :   if((filter & PDF_TEXT_FILTER_LINE_ENDINGS) && \
    1412                 :      (pdf_text_filter_normalize_line_endings(text) != PDF_OK))
    1413                 :     {
    1414                 :       PDF_DEBUG_BASE("Error applying Line Ending normalization filter");
    1415               0 :       return PDF_ETEXTENC;
    1416                 :     }
    1417                 : 
    1418                 :   /* 0x00000010 */
    1419              63 :   if((filter & PDF_TEXT_FILTER_UPPER_CASE) && \
    1420                 :      (pdf_text_filter_upper_case(text) != PDF_OK))
    1421                 :     {
    1422                 :       PDF_DEBUG_BASE("Error applying Upper Case filter");
    1423               0 :       return PDF_ETEXTENC;
    1424                 :     }
    1425                 :   /* 0x00000100 */
    1426              63 :   else if((filter & PDF_TEXT_FILTER_LOWER_CASE) && \
    1427                 :           (pdf_text_filter_lower_case(text) != PDF_OK))
    1428                 :     {
    1429                 :       PDF_DEBUG_BASE("Error applying Lower Case filter");
    1430               0 :       return PDF_ETEXTENC;
    1431                 :     }
    1432                 :   /* 0x00001000 */
    1433              63 :   else if((filter & PDF_TEXT_FILTER_TITLE_CASE) && \
    1434                 :            (pdf_text_filter_title_case(text) != PDF_OK))
    1435                 :     {
    1436                 :       PDF_DEBUG_BASE("Error applying Title Case filter");
    1437               0 :       return PDF_ETEXTENC;
    1438                 :     }
    1439                 : 
    1440                 :   /* 0x00010000 */
    1441              63 :   if((filter & PDF_TEXT_FILTER_REMOVE_AMP) && \
    1442                 :      (pdf_text_filter_remove_amp(text) != PDF_OK))
    1443                 :     {
    1444                 :       PDF_DEBUG_BASE("Error applying Ampersand Removal filter");
    1445               0 :       return PDF_ETEXTENC;
    1446                 :     }
    1447                 : 
    1448                 :   /* 0x00100000 */
    1449              63 :   if((filter & PDF_TEXT_FILTER_NORM_WITH_FULL_WIDTH) && \
    1450                 :      (pdf_text_filter_normalize_full_width_ascii(text) != PDF_OK))
    1451                 :     {
    1452                 :       PDF_DEBUG_BASE("Error applying FullWidth Normalization filter");
    1453               0 :       return PDF_ETEXTENC;
    1454                 :     }
    1455                 : 
    1456                 :   /* 0x01000000 */
    1457              63 :   if ((filter & PDF_TEXT_FILTER_REMOVE_LINE_ENDINGS) && 
    1458                 :      (pdf_text_filter_remove_line_endings (text) != PDF_OK))
    1459                 :     {
    1460                 :       PDF_DEBUG_BASE ("Error applying Line Ending Removal filter");
    1461               0 :       return PDF_ETEXTENC;
    1462                 :     }
    1463                 : 
    1464              63 :   text->modified = PDF_TRUE;
    1465              63 :   return PDF_OK;
    1466                 : }
    1467                 : 
    1468                 : const pdf_char_t *
    1469                 : pdf_text_get_printable (pdf_text_t text)
    1470               0 : {
    1471                 :   pdf_size_t size;
    1472                 : 
    1473               0 :   if (text->printable != NULL){
    1474               0 :     if (text->modified == PDF_FALSE){
    1475               0 :       return text->printable;
    1476                 :     }else{
    1477               0 :       pdf_dealloc (text->printable);
    1478                 :     }
    1479                 :   }
    1480                 : 
    1481                 : #ifdef PDF_HOST_WIN32
    1482                 :   pdf_text_get_unicode (&text->printable, &size, text, PDF_TEXT_UTF16_LE,
    1483                 :                         PDF_TEXT_UNICODE_WITH_NUL_SUFFIX);
    1484                 : #else
    1485               0 :   pdf_text_get_unicode (&text->printable, &size, text, PDF_TEXT_UTF8,
    1486                 :                         PDF_TEXT_UNICODE_WITH_NUL_SUFFIX);
    1487                 : #endif /*PDF_HOST_WIN32*/
    1488                 : 
    1489               0 :   text->modified = PDF_FALSE;
    1490                 : 
    1491               0 :   return text->printable;
    1492                 : }
    1493                 : 
    1494                 : 
    1495                 : pdf_i32_t
    1496                 : pdf_text_cmp (const pdf_text_t text1,
    1497                 :               const pdf_text_t text2,
    1498                 :               const pdf_bool_t case_sensitive,
    1499                 :               pdf_status_t *p_ret_code)
    1500               9 : {
    1501               9 :   if(p_ret_code != NULL)
    1502                 :     {
    1503               6 :       *p_ret_code = PDF_OK;
    1504                 :     }
    1505                 : 
    1506                 :   /* Compare sizes of the texts */
    1507               9 :   if(text1->size != text2->size)
    1508                 :     {
    1509               2 :       return ((text1->size > text2->size) ? 1 : -1);
    1510                 :     }
    1511                 : 
    1512               7 :   if(case_sensitive == PDF_TRUE)
    1513                 :     {
    1514               5 :       return memcmp(text1->data, text2->data, text1->size);
    1515                 :     }
    1516                 :   else
    1517                 :     {
    1518               2 :       return pdf_text_cmp_non_case_sensitive(text1, text2, p_ret_code);
    1519                 :     }
    1520                 : }
    1521                 : 
    1522                 : 
    1523                 : /* -------------------------- Private functions ----------------------------- */
    1524                 : 
    1525                 : static pdf_i32_t
    1526                 : pdf_text_cmp_non_case_sensitive(pdf_text_t text1,
    1527                 :                                 pdf_text_t text2,
    1528                 :                                 pdf_status_t *p_ret_code)
    1529               2 : {
    1530                 :   /* Generate word boundaries list, if not already done */
    1531               2 :   if((pdf_text_fill_word_boundaries_list(text1->word_boundaries, \
    1532                 :                                          text1->data, \
    1533                 :                                          text1->size) == PDF_OK) && \
    1534                 :      (pdf_text_fill_word_boundaries_list(text2->word_boundaries, \
    1535                 :                                          text2->data, \
    1536                 :                                          text2->size) == PDF_OK))
    1537                 :     {
    1538                 :       pdf_size_t size1;
    1539                 :       pdf_size_t size2;
    1540                 : 
    1541               4 :       size1 = pdf_list_size(text1->word_boundaries);
    1542               4 :       size2 = pdf_list_size(text2->word_boundaries);
    1543                 :       /* First, compare number of words in each text */
    1544               2 :       if(size1 != size2)
    1545                 :         {
    1546                 :           PDF_DEBUG_BASE("Different sizes...");
    1547               0 :           return ((size1 > size2) ? 1 : -1);
    1548                 :         }
    1549                 :       else
    1550                 :         {
    1551                 :           /* Perform a word-per-word lower case comparison! */
    1552                 :           int n;
    1553                 : 
    1554                 :           /* Get word from both texts */
    1555               2 :           n = 0;
    1556              22 :           while(n < size1)
    1557                 :             {
    1558                 :               struct pdf_text_wb_s *p_word1;
    1559                 :               struct pdf_text_wb_s *p_word2;
    1560                 :               pdf_i32_t ret_num;
    1561                 : 
    1562              18 :               if(pdf_list_get_at(text1->word_boundaries, \
    1563                 :                                  n, \
    1564                 :                                  (const void **)&p_word1) != PDF_OK)
    1565                 :                 {
    1566               0 :                   *p_ret_code = PDF_ETEXTENC;
    1567                 :                   PDF_DEBUG_BASE("Error getting word '%d' from text1", n);
    1568                 :                   /* An error happened computing word boundaries! */
    1569               0 :                   return -1;
    1570                 :                 }
    1571                 : 
    1572              18 :               if(pdf_list_get_at(text2->word_boundaries,
    1573                 :                                  n,
    1574                 :                                  (const void **)&p_word2) != PDF_OK)
    1575                 :                 {
    1576               0 :                   *p_ret_code = PDF_ETEXTENC;
    1577                 :                   PDF_DEBUG_BASE("Error getting word '%d' from text2", n);
    1578                 :                   /* An error happened computing word boundaries! */
    1579               0 :                   return -1;
    1580                 :                 }
    1581                 : 
    1582              18 :               ret_num = pdf_text_compare_words(p_word1->word_start,
    1583                 :                                                p_word1->word_size,
    1584                 :                                                p_word2->word_start,
    1585                 :                                                p_word2->word_size,
    1586                 :                                                pdf_text_get_language(text1),
    1587                 :                                                pdf_text_get_language(text2),
    1588                 :                                                p_ret_code);
    1589                 :               /* If words are not equal, return the code */
    1590              18 :               if(ret_num != 0)
    1591                 :                 {
    1592                 :                   PDF_DEBUG_BASE("Words are not equal...");
    1593               0 :                   return ret_num;
    1594                 :                 }
    1595              18 :               ++n;
    1596                 :             }
    1597                 :           /* If arrived here, the strings are completely equal */
    1598               2 :           return 0;
    1599                 :         }
    1600                 :     }
    1601                 :   else
    1602                 :     {
    1603               0 :       if(p_ret_code != NULL)
    1604                 :         {
    1605               0 :           *p_ret_code = PDF_ETEXTENC;
    1606                 :         }
    1607                 :       PDF_DEBUG_BASE("Problem computing word boundaries. Comparison is not"
    1608                 :                      " valid");
    1609               0 :       return -1; /* An error happened computing word boundaries! */
    1610                 :     }
    1611                 :   return 0;
    1612                 : }
    1613                 : 
    1614                 : 
    1615                 : static pdf_i32_t
    1616                 : pdf_text_compare_words(const pdf_char_t *word1,
    1617                 :                        const pdf_size_t size1,
    1618                 :                        const pdf_char_t *word2,
    1619                 :                        const pdf_size_t size2,
    1620                 :                        const pdf_char_t *language1,
    1621                 :                        const pdf_char_t *language2,
    1622                 :                        pdf_status_t *p_ret_code)
    1623              18 : {
    1624                 :   pdf_char_t *lower1;
    1625                 :   pdf_char_t *lower2;
    1626                 :   pdf_size_t new_size1;
    1627                 :   pdf_size_t new_size2;
    1628                 :   pdf_size_t worst_size;
    1629                 : 
    1630              18 :   if(p_ret_code != NULL)
    1631                 :     {
    1632              18 :       *p_ret_code = PDF_OK;
    1633                 :     }
    1634                 : 
    1635                 :   /* Compare sizes of words */
    1636              18 :   if(size1 != size2)
    1637                 :     {
    1638               0 :       return ((size1 > size2) ? 1 : -1);
    1639                 :     }
    1640                 : 
    1641                 :   /* Compute new worst word length */
    1642              18 :   worst_size = size1 * UCD_SC_MAX_EXPAND;
    1643                 : 
    1644                 :   /* Allocate memory for lowercases */
    1645              18 :   lower1 = (pdf_char_t *)pdf_alloc(worst_size);
    1646              18 :   lower2 = (pdf_char_t *)pdf_alloc(worst_size);
    1647              18 :   if((lower1 == NULL) || \
    1648                 :      (lower2 == NULL))
    1649                 :     {
    1650                 :       PDF_DEBUG_BASE("Unable to compare words");
    1651               0 :       if(p_ret_code != NULL)
    1652                 :         {
    1653               0 :           *p_ret_code = PDF_ENOMEM;
    1654                 :         }
    1655               0 :       if(lower1 != NULL)
    1656                 :         {
    1657               0 :           pdf_dealloc(lower1);
    1658                 :         }
    1659               0 :       if(lower2 != NULL)
    1660                 :         {
    1661               0 :           pdf_dealloc(lower2);
    1662                 :         }
    1663               0 :       return -1;
    1664                 :     }
    1665                 : 
    1666                 :   /* Lowercase words */
    1667              18 :   if(pdf_text_ucd_word_change_case(lower1, &new_size1,
    1668                 :                                    UNICODE_CASE_INFO_LOWER_CASE,
    1669                 :                                    word1, size1, language1)!= PDF_OK)
    1670                 :     {
    1671                 :       PDF_DEBUG_BASE("Problem lowercasing word 1");
    1672               0 :       pdf_dealloc(lower1);
    1673               0 :       pdf_dealloc(lower2);
    1674               0 :       if(p_ret_code != NULL)
    1675                 :         {
    1676               0 :           *p_ret_code = PDF_ETEXTENC;
    1677                 :         }
    1678               0 :       return -1;
    1679                 :     }
    1680              18 :   if(pdf_text_ucd_word_change_case(lower2, &new_size2,
    1681                 :                                    UNICODE_CASE_INFO_LOWER_CASE,
    1682                 :                                    word2, size2, language2)!= PDF_OK)
    1683                 :     {
    1684                 :       PDF_DEBUG_BASE("Problem lowercasing word 2");
    1685               0 :       pdf_dealloc(lower1);
    1686               0 :       pdf_dealloc(lower2);
    1687               0 :       if(p_ret_code != NULL)
    1688                 :         {
    1689               0 :           *p_ret_code = PDF_ETEXTENC;
    1690                 :         }
    1691               0 :       return -1;
    1692                 :     }
    1693                 : 
    1694                 :   /* Compare NEW sizes of words */
    1695              18 :   if(new_size1 != new_size2)
    1696                 :     {
    1697               0 :       pdf_dealloc(lower1);
    1698               0 :       pdf_dealloc(lower2);
    1699               0 :       return ((new_size1 > new_size2) ? 1 : -1);
    1700                 :     }
    1701                 :   else
    1702                 :     {
    1703                 :       /* Compare contents of words */
    1704                 :       pdf_i32_t ret_val;
    1705              18 :       ret_val = memcmp(lower1, lower2, new_size1);
    1706              18 :       pdf_dealloc(lower1);
    1707              18 :       pdf_dealloc(lower2);
    1708              18 :       return ret_val;
    1709                 :     }
    1710                 : }
    1711                 : 
    1712                 : 
    1713                 : /* Function to clean all contents of a given pdf_text_t variable */
    1714                 : void
    1715                 : pdf_text_clean_contents(pdf_text_t text)
    1716             691 : {
    1717                 :   /* Clear all contents */
    1718             691 :   if(text->data != NULL)
    1719                 :     {
    1720               0 :       pdf_dealloc(text->data);
    1721               0 :       text->data = NULL;
    1722                 :     }
    1723                 : 
    1724                 :   /* Clean list of word breaks (destroy and create empty) */
    1725             691 :   pdf_text_clean_word_boundaries_list(&(text->word_boundaries));
    1726                 : 
    1727                 :   /* Clean country and language info */
    1728             691 :   memset(&(text->lang[0]), 0, PDF_TEXT_CCL);
    1729             691 :   memset(&(text->country[0]), 0, PDF_TEXT_CCL);
    1730                 :   /* Reset data size */
    1731             691 :   text->size = 0;
    1732                 : 
    1733             691 :   text->modified = PDF_FALSE;
    1734             691 :   if (text->printable != NULL){
    1735               0 :     pdf_dealloc (text->printable);
    1736               0 :     text->printable = NULL;
    1737                 :   }
    1738             691 : }
    1739                 : 
    1740                 : 
    1741                 : 
    1742                 : static pdf_status_t
    1743                 : pdf_text_get_lang_from_utf16be(pdf_text_t element,
    1744                 :                                pdf_char_t **str_out,
    1745                 :                                pdf_size_t *str_out_length,
    1746                 :                                const pdf_char_t *str_in,
    1747                 :                                const pdf_size_t str_in_length)
    1748              28 : {
    1749                 :   /* Country code is optional */
    1750              28 :   short country_available = PDF_FALSE;
    1751                 :   pdf_char_t aux[PDF_TEXT_CCL];
    1752                 : 
    1753                 :   /* Check first code marker and MINIMUM length of array */
    1754              28 :   if((str_in_length < PDF_TEXT_LCMINL) || \
    1755                 :      (str_in[1] != PDF_TEXT_LCI_1) || \
    1756                 :      (str_in[0] != PDF_TEXT_LCI_0))
    1757                 :     {
    1758               0 :       return PDF_EBADDATA;
    1759                 :     }
    1760                 : 
    1761                 :   /* Check last code marker position and MAXIMUM length of array.
    1762                 :    *  Additionally, set `str_out' and `str_out_length' */
    1763              42 :   if((str_in[5] != PDF_TEXT_LCI_1) || \
    1764                 :      (str_in[4] != PDF_TEXT_LCI_0))
    1765                 :     {
    1766                 :       /* Check last marker in bytes 6 and 7... */
    1767              14 :       if((str_in_length >= PDF_TEXT_LCMAXL) && \
    1768                 :          (str_in[7] == PDF_TEXT_LCI_1) && \
    1769                 :          (str_in[6] == PDF_TEXT_LCI_0))
    1770                 :         {
    1771              14 :           country_available = PDF_TRUE;
    1772              14 :           *str_out = (pdf_char_t *)str_in + PDF_TEXT_LCMAXL;
    1773              14 :           *str_out_length = str_in_length - PDF_TEXT_LCMAXL;
    1774                 :         }
    1775                 :       else
    1776                 :         {
    1777                 :           /* Either size is too short or last marker not found. This is a
    1778                 :            *  problem in the input data string */
    1779               0 :           return PDF_EBADDATA;
    1780                 :         }
    1781                 :     }
    1782                 :   else
    1783                 :     {
    1784                 :       /* There is no optional country code info */
    1785              14 :       *str_out = (pdf_char_t *)str_in + PDF_TEXT_LCMINL;
    1786              14 :       *str_out_length = str_in_length - PDF_TEXT_LCMINL;
    1787                 :     }
    1788                 : 
    1789                 : 
    1790                 :   /* Store 2-bytes ISO 639 language code */
    1791              28 :   memcpy(&aux[0], &str_in[2], PDF_TEXT_CCL-1);
    1792              28 :   aux[PDF_TEXT_CCL-1] = '\0';
    1793              28 :   if(pdf_text_set_language(element, (pdf_char_t *)aux) != PDF_OK)
    1794                 :     {
    1795               0 :       return PDF_ETEXTENC;
    1796                 :     }
    1797                 : 
    1798                 :   /* If optional country code is also available, store it... */
    1799              28 :   if(country_available)
    1800                 :     {
    1801              14 :       memcpy(&aux[0], &str_in[4], PDF_TEXT_CCL-1);
    1802                 :       /* Last NUL byte is already set */
    1803                 :       /* Store 2-bytes ISO 3166 country code */
    1804              14 :       if(pdf_text_set_country(element, (pdf_char_t *)aux) != PDF_OK)
    1805                 :         {
    1806               0 :           return PDF_ETEXTENC;
    1807                 :         }
    1808                 :     }
    1809                 : 
    1810              28 :   return PDF_OK;
    1811                 : }
    1812                 : 
    1813                 : static enum pdf_text_unicode_encoding_e
    1814                 : pdf_text_transform_he_to_unicode_encoding(enum pdf_text_unicode_encoding_e enc)
    1815                 : {
    1816            1121 :   if((enc == PDF_TEXT_UTF16_HE) || \
    1817                 :      (enc == PDF_TEXT_UTF32_HE))
    1818                 :     {
    1819             267 :       enc += (PDF_IS_BIG_ENDIAN ? PDF_TEXT_HE_TO_BE : PDF_TEXT_HE_TO_LE);
    1820                 :     }
    1821            1121 :   return enc;
    1822                 : }
    1823                 : 
    1824                 : 
    1825                 : static pdf_status_t
    1826                 : pdf_text_get_unicode_string_header(pdf_char_t header[PDF_TEXT_USHMAXL],
    1827                 :                                    pdf_size_t *header_length,
    1828                 :                                    const enum pdf_text_unicode_encoding_e enc,
    1829                 :                                    const pdf_u32_t options,
    1830                 :                                    const pdf_char_t *language,
    1831                 :                                    const pdf_char_t *country)
    1832             108 : {
    1833                 :   short bom_bytes;
    1834                 :   short lang_bytes;
    1835             108 :   pdf_text_bom_t bom  = pdf_text_get_unicode_bom(enc);
    1836                 : 
    1837                 :   /* We know that these pointers will never be null if the function is only
    1838                 :    * called by pdf_text_get_unicode, but just in case */
    1839             108 :   if((language == NULL) || \
    1840                 :      (country == NULL) || \
    1841                 :      (header_length == NULL))
    1842                 :     {
    1843                 :       PDF_DEBUG_BASE("Invalid pointers received");
    1844               0 :       return PDF_EBADDATA;
    1845                 :     }
    1846                 : 
    1847                 :   /* Check if BOM really requested */
    1848             108 :   bom_bytes = 0;
    1849             108 :   if(options & PDF_TEXT_UNICODE_WITH_BOM)
    1850                 :     {
    1851              84 :       bom_bytes = bom.bom_bytes;
    1852                 :     }
    1853                 : 
    1854                 :   /* Check if Lang/Country code really requested (only for UTF16BE!!) */
    1855             108 :   lang_bytes = 0;
    1856             108 :   if((enc == PDF_TEXT_UTF16_BE) && \
    1857                 :      (options & PDF_TEXT_UTF16BE_WITH_LANGCODE) && \
    1858                 :      (strlen((char *)language) == 2))
    1859                 :     {
    1860                 :       /* At least language is available, but country may also be
    1861                 :        *  available */
    1862              48 :       lang_bytes = (strlen((char *)country) == 2) ? PDF_TEXT_LCMAXL: \
    1863                 :                                                     PDF_TEXT_LCMINL;
    1864                 :     }
    1865                 : 
    1866                 :   /* Modify header array, if needed, to add Language/Country info and/or
    1867                 :    *  BOM */
    1868             108 :   *header_length = lang_bytes + bom_bytes;
    1869             108 :   if((*header_length > 0) && \
    1870                 :      (*header_length < PDF_TEXT_USHMAXL)) /* (just in case) */
    1871                 :     {
    1872                 :       pdf_char_t *walker;
    1873             108 :       walker = &header[0];
    1874                 : 
    1875                 :       /* Add BOM */
    1876             108 :       if(bom_bytes > 0)
    1877                 :         {
    1878              84 :           memcpy(walker, bom.bom_data, bom_bytes);
    1879                 :           /* Update walker */
    1880              84 :           walker += bom_bytes;
    1881                 :         }
    1882                 : 
    1883                 :       /* Add Lang/Country */
    1884             108 :       if(lang_bytes > 0)
    1885                 :         {
    1886                 :           /* Language and Country */
    1887              48 :           if(lang_bytes == PDF_TEXT_LCMAXL)
    1888                 :             {
    1889              24 :               sprintf((char *)walker, "%c%c%2s%2s%c%c",
    1890                 :                       PDF_TEXT_LCI_0,PDF_TEXT_LCI_1,
    1891                 :                       language, country,
    1892                 :                       PDF_TEXT_LCI_0,PDF_TEXT_LCI_1);
    1893                 :             }
    1894                 :           /* Language only */
    1895                 :           else
    1896                 :             {
    1897              24 :               sprintf((char *)walker, "%c%c%2s%c%c",
    1898                 :                       PDF_TEXT_LCI_0,PDF_TEXT_LCI_1,
    1899                 :                       language,
    1900                 :                       PDF_TEXT_LCI_0,PDF_TEXT_LCI_1);
    1901                 :             }
    1902                 :         }
    1903                 :     }
    1904             108 :   return PDF_OK;
    1905                 : }
    1906                 : 
    1907                 : 
    1908                 : 
    1909                 : pdf_bool_t
    1910                 : pdf_text_is_ascii7(const pdf_char_t *utf8data, const pdf_size_t size)
    1911              32 : {
    1912                 :   pdf_size_t i;
    1913             159 :   for(i=0; i<size; ++i)
    1914                 :     {
    1915                 :       /* Just check the MSB. In ASCII-7 it must be 0 */
    1916             127 :       if(utf8data[i] & 0x80)
    1917                 :         {
    1918               0 :           return PDF_FALSE;
    1919                 :         }
    1920                 :     }
    1921              32 :   return PDF_TRUE;
    1922                 : }
    1923                 : 
    1924                 : 
    1925                 : /* Generate Word Boundaries list from text object */
    1926                 : pdf_status_t
    1927                 : pdf_text_generate_word_boundaries(pdf_text_t text)
    1928              54 : {
    1929             108 :   if(pdf_list_size(text->word_boundaries) == 0)
    1930                 :     {
    1931              54 :       return pdf_text_fill_word_boundaries_list(text->word_boundaries,
    1932                 :                                                 text->data, text->size);
    1933                 :     }
    1934                 :   else
    1935                 :     {
    1936                 :       /* List already created */
    1937               0 :       return PDF_OK;
    1938                 :     }
    1939                 : }
    1940                 : 
    1941                 : pdf_status_t
    1942                 : pdf_text_destroy_word_boundaries_list(pdf_list_t *p_word_boundaries)
    1943             818 : {
    1944                 :   pdf_size_t n_words;
    1945                 :   pdf_size_t i;
    1946                 :   /* Walk list of words */
    1947            1636 :   n_words = pdf_list_size(*p_word_boundaries);
    1948            1166 :   for(i = 0; i < n_words; ++i)
    1949                 :     {
    1950             348 :       struct pdf_text_wb_s *p_word = NULL;
    1951                 : 
    1952                 :       /* Get word to process from list of words */
    1953             348 :       if(pdf_list_get_at(*p_word_boundaries, \
    1954                 :                          i, \
    1955                 :                          (const void **)&p_word) != PDF_OK)
    1956                 :         {
    1957               0 :           return PDF_ETEXTENC;
    1958                 :         }
    1959                 :       /* Dealloc word (pointed by the list element) */
    1960             348 :       pdf_dealloc(p_word);
    1961                 :     }
    1962                 :   /* Destroy list */
    1963             818 :   pdf_list_destroy(*p_word_boundaries);
    1964             818 :   return PDF_OK;
    1965                 : }
    1966                 : 
    1967                 : 
    1968                 : /* Create empty Word Boundaries list */
    1969                 : pdf_status_t
    1970                 : pdf_text_create_word_boundaries_list(pdf_list_t *p_word_boundaries)
    1971             994 : {
    1972                 :   pdf_list_t temp_list;
    1973                 :   /* Initialize word boundaries list */
    1974             994 :   if(pdf_list_new (NULL, NULL, PDF_TRUE, &temp_list) != PDF_OK)
    1975                 :     {
    1976               0 :       return PDF_ETEXTENC;
    1977                 :     }
    1978                 : 
    1979                 :   /* Set output if everything went ok */
    1980             994 :   *p_word_boundaries = temp_list;
    1981             994 :   return PDF_OK;
    1982                 : }
    1983                 : 
    1984                 : /* Clean (destroy and create empty) Word Boundaries list */
    1985                 : static pdf_status_t
    1986                 : pdf_text_clean_word_boundaries_list(pdf_list_t *p_word_boundaries)
    1987                 : {
    1988                 :   /* Only destroy+create if list is not empty! */
    1989            1382 :   if(pdf_list_size(*p_word_boundaries) != 0)
    1990                 :     {
    1991                 :       /* Destroy element contents */
    1992               0 :       pdf_text_destroy_word_boundaries_list(p_word_boundaries);
    1993                 :       /* Create empty list */
    1994               0 :       return pdf_text_create_word_boundaries_list(p_word_boundaries);
    1995                 :     }
    1996                 :   else
    1997                 :     {
    1998                 :       /* List is already empty */
    1999             691 :       return PDF_OK;
    2000                 :     }
    2001                 : }
    2002                 : 
    2003                 : 
    2004                 : /* Fill in the Word Boundaries list using the given data */
    2005                 : static pdf_status_t
    2006                 : pdf_text_fill_word_boundaries_list(pdf_list_t word_boundaries,
    2007                 :                                    const pdf_char_t *data,
    2008                 :                                    const pdf_size_t size)
    2009              58 : {
    2010                 :   /* Perform a basic check of data length */
    2011              58 :   if(size % 4 != 0)
    2012                 :     {
    2013               0 :       return PDF_EBADDATA;
    2014                 :     }
    2015                 : 
    2016                 :   /* Only try to find word boundaries if length is greater than 0! */
    2017              58 :   if(size > 0)
    2018                 :     {
    2019                 :       pdf_char_t *walker;
    2020                 :       pdf_size_t n_bytes_left;
    2021                 : 
    2022                 :       /* Initialize walker and number of bytes left */
    2023              58 :       walker = (pdf_char_t *)data;
    2024              58 :       n_bytes_left = size;
    2025                 : 
    2026             308 :       while(n_bytes_left > 0)
    2027                 :         {
    2028             192 :           struct pdf_text_wb_s *p_word = NULL;
    2029                 : 
    2030                 :           /* Allocate new word */
    2031             192 :           p_word = (struct pdf_text_wb_s *)pdf_alloc(sizeof(struct pdf_text_wb_s));
    2032             192 :           if(p_word == NULL)
    2033                 :             {
    2034               0 :               return PDF_ENOMEM;
    2035                 :             }
    2036                 : 
    2037                 :           /* RULE WB1: Break at the start of text ( SOT % ) */
    2038             192 :           p_word->word_start = walker;
    2039                 : 
    2040             192 :           if(pdf_text_ucd_wb_detect_next(walker,
    2041                 :                                          n_bytes_left,
    2042                 :                                          &(p_word->word_stop),
    2043                 :                                          &n_bytes_left)!= PDF_OK)
    2044                 :             {
    2045               0 :               return PDF_ETEXTENC;
    2046                 :             }
    2047                 : 
    2048                 :           /* Compute word size in bytes */
    2049             192 :           p_word->word_size = (p_word->word_stop - p_word->word_start) + 4;
    2050                 : 
    2051                 :           /* Add new word boundary to list */
    2052             192 :           pdf_list_add_last(word_boundaries, p_word, NULL);
    2053                 : 
    2054                 :           /* Update walker */
    2055             192 :           walker = p_word->word_stop + 4;
    2056                 :         }
    2057                 :     }
    2058                 : 
    2059              58 :   return PDF_OK;
    2060                 : }
    2061                 : 
    2062                 : 
    2063                 : /* End of pdf-text.c */

Generated by: LTP GCOV extension version 1.6