LTP GCOV extension - code coverage report
Current view: directory - src/base - pdf-text-encoding.c
Test: libgnupdf.info
Date: 2010-07-31 Instrumented lines: 248
Code covered: 89.5 % Executed lines: 222

       1                 : /* -*- mode: C -*-
       2                 :  *
       3                 :  *       File:         pdf-text-encoding.c
       4                 :  *       Date:         Fri Jan 11 21:09:56 2008
       5                 :  *
       6                 :  *       GNU PDF Library - Encoded Text handling utilities - Encoding
       7                 :  *
       8                 :  */
       9                 : 
      10                 : /* Copyright (C) 2008 Free Software Foundation, Inc. */
      11                 : 
      12                 : /* This program is free software: you can redistribute it and/or modify
      13                 :  * it under the terms of the GNU General Public License as published by
      14                 :  * the Free Software Foundation, either version 3 of the License, or
      15                 :  * (at your option) any later version.
      16                 :  *
      17                 :  * This program is distributed in the hope that it will be useful,
      18                 :  * but WITHOUT ANY WARRANTY; without even the implied warranty of
      19                 :  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
      20                 :  * GNU General Public License for more details.
      21                 :  *
      22                 :  * You should have received a copy of the GNU General Public License
      23                 :  * along with this program.  If not, see <http://www.gnu.org/licenses/>.
      24                 :  */
      25                 : 
      26                 : #include <config.h>
      27                 : 
      28                 : #include <string.h>
      29                 : #include <stdio.h>
      30                 : 
      31                 : #include <pdf-text-encoding.h>
      32                 : #include <pdf-text-context.h>
      33                 : /* Byte-swap helpers for 16-bit and 32-bit values.  NOTE(review): the 16-bit expansion has no
      34                 :  * outer parentheses and neither macro parenthesizes `number`; pass only simple operands. */
      35                 : #define PDF_TEXT_CHANGE_ENDIANNESS_16BIT(number) \
      36                 :   ((0x00FF & number) << 8) | ((0xFF00 & number) >> 8)
      37                 : 
      38                 : #define PDF_TEXT_CHANGE_ENDIANNESS_32BIT(number) \
      39                 :   (((0x000000FF & number) << 24) | \
      40                 :    ((0x0000FF00 & number) << 8 ) | \
      41                 :    ((0x00FF0000 & number) >> 8 ) | \
      42                 :    ((0xFF000000 & number) >> 24))
      43                 : 
      44                 : 
      45                 : /* Mapping from PDF Doc Encoding byte values (the index) to UNICODE UTF32 code points
      46                 :  * (Host Endian!); 0x0000 marks an undefined byte. Obtained from PDF Reference v1.7, appendix D.2 */
      47                 : #define PDFDOCENC_MAX            256
      48                 : static const pdf_u32_t pdfdocenc_map [PDFDOCENC_MAX] = { /* INDEXES */
      49                 :   0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, /* 00, 07 */
      50                 :   0x0000, 0x0009, 0x000A, 0x0000, 0x0000, 0x000D, 0x0000, 0x0000, /* 08, 0F */
      51                 :   0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, /* 10, 17 */
      52                 :   0x02D8, 0x02C7, 0x02C6, 0x02D9, 0x02DD, 0x02DB, 0x02DA, 0x02DC, /* 18, 1F */
      53                 :   0x0020, 0x0021, 0x0022, 0x0023, 0x0024, 0x0025, 0x0026, 0x0027, /* 20, 27 */
      54                 :   0x0028, 0x0029, 0x002A, 0x002B, 0x002C, 0x002D, 0x002E, 0x002F, /* 28, 2F */
      55                 :   0x0030, 0x0031, 0x0032, 0x0033, 0x0034, 0x0035, 0x0036, 0x0037, /* 30, 37 */
      56                 :   0x0038, 0x0039, 0x003A, 0x003B, 0x003C, 0x003D, 0x003E, 0x003F, /* 38, 3F */
      57                 :   0x0040, 0x0041, 0x0042, 0x0043, 0x0044, 0x0045, 0x0046, 0x0047, /* 40, 47 */
      58                 :   0x0048, 0x0049, 0x004A, 0x004B, 0x004C, 0x004D, 0x004E, 0x004F, /* 48, 4F */
      59                 :   0x0050, 0x0051, 0x0052, 0x0053, 0x0054, 0x0055, 0x0056, 0x0057, /* 50, 57 */
      60                 :   0x0058, 0x0059, 0x005A, 0x005B, 0x005C, 0x005D, 0x005E, 0x005F, /* 58, 5F */
      61                 :   0x0060, 0x0061, 0x0062, 0x0063, 0x0064, 0x0065, 0x0066, 0x0067, /* 60, 67 */
      62                 :   0x0068, 0x0069, 0x006A, 0x006B, 0x006C, 0x006D, 0x006E, 0x006F, /* 68, 6F */
      63                 :   0x0070, 0x0071, 0x0072, 0x0073, 0x0074, 0x0075, 0x0076, 0x0077, /* 70, 77 */
      64                 :   0x0078, 0x0079, 0x007A, 0x007B, 0x007C, 0x007D, 0x007E, 0x0000, /* 78, 7F */
      65                 :   0x2022, 0x2020, 0x2021, 0x2026, 0x2014, 0x2013, 0x0192, 0x2044, /* 80, 87 */
      66                 :   0x2039, 0x203A, 0x2212, 0x2030, 0x201E, 0x201C, 0x201D, 0x2018, /* 88, 8F */
      67                 :   0x2019, 0x201A, 0x2122, 0xFB01, 0xFB02, 0x0141, 0x0152, 0x0160, /* 90, 97 */
      68                 :   0x0178, 0x017D, 0x0131, 0x0142, 0x0153, 0x0161, 0x017E, 0x0000, /* 98, 9F */
      69                 :   0x20AC, 0x00A1, 0x00A2, 0x00A3, 0x00A4, 0x00A5, 0x00A6, 0x00A7, /* A0, A7 */
      70                 :   0x00A8, 0x00A9, 0x00AA, 0x00AB, 0x00AC, 0x0000, 0x00AE, 0x00AF, /* A8, AF */
      71                 :   0x00B0, 0x00B1, 0x00B2, 0x00B3, 0x00B4, 0x00B5, 0x00B6, 0x00B7, /* B0, B7 */
      72                 :   0x00B8, 0x00B9, 0x00BA, 0x00BB, 0x00BC, 0x00BD, 0x00BE, 0x00BF, /* B8, BF */
      73                 :   0x00C0, 0x00C1, 0x00C2, 0x00C3, 0x00C4, 0x00C5, 0x00C6, 0x00C7, /* C0, C7 */
      74                 :   0x00C8, 0x00C9, 0x00CA, 0x00CB, 0x00CC, 0x00CD, 0x00CE, 0x00CF, /* C8, CF */
      75                 :   0x00D0, 0x00D1, 0x00D2, 0x00D3, 0x00D4, 0x00D5, 0x00D6, 0x00D7, /* D0, D7 */
      76                 :   0x00D8, 0x00D9, 0x00DA, 0x00DB, 0x00DC, 0x00DD, 0x00DE, 0x00DF, /* D8, DF */
      77                 :   0x00E0, 0x00E1, 0x00E2, 0x00E3, 0x00E4, 0x00E5, 0x00E6, 0x00E7, /* E0, E7 */
      78                 :   0x00E8, 0x00E9, 0x00EA, 0x00EB, 0x00EC, 0x00ED, 0x00EE, 0x00EF, /* E8, EF */
      79                 :   0x00F0, 0x00F1, 0x00F2, 0x00F3, 0x00F4, 0x00F5, 0x00F6, 0x00F7, /* F0, F7 */
      80                 :   0x00F8, 0x00F9, 0x00FA, 0x00FB, 0x00FC, 0x00FD, 0x00FE, 0x00FF  /* F8, FF */
      81                 : };
      82                 : 
      83                 : /* Definition of the interval type: a range of byte values, both ends inclusive */
      84                 : struct pdfdocenc_map_interval_s {
      85                 :   pdf_u8_t interval_start;  /* first value of the interval */
      86                 :   pdf_u8_t interval_stop;   /* last value of the interval */
      87                 : };
      88                 : typedef struct pdfdocenc_map_interval_s pdfdocenc_map_interval_t;
      89                 : /* Direct intervals: code points in these ranges convert to the byte with the same value.
      90                 :  * NOTE(review): PDFDOCENC_MDI is 10 but only 5 rows follow; the rest are zero-filled { 0x00, 0x00 }. */
      91                 : #define PDFDOCENC_MDI  10
      92                 : static const pdfdocenc_map_interval_t pdfdocenc_map_direct[PDFDOCENC_MDI] = {
      93                 :   { 0x09, 0x0A },
      94                 :   { 0x0D, 0x0D },
      95                 :   { 0x20, 0x7E },
      96                 :   { 0xA1, 0xAC },
      97                 :   { 0xAE, 0xFF }
      98                 : };
      99                 : /* Indirect intervals: byte ranges whose code points must be looked up in pdfdocenc_map.
     100                 :  * NOTE(review): PDFDOCENC_MII is 10 but only 3 rows follow; the rest are zero-filled { 0x00, 0x00 }. */
     101                 : #define PDFDOCENC_MII  10
     102                 : static const pdfdocenc_map_interval_t pdfdocenc_map_indirect[PDFDOCENC_MII] = {
     103                 :   { 0x18, 0x1F },
     104                 :   { 0x80, 0x9E },
     105                 :   { 0xA0, 0xA0 }
     106                 : };
     107                 : 
     108                 : /* Undefined intervals, probably not really needed
     109                 : #define PDFDOCENC_MUI  10
     110                 : static const pdfdocenc_map_interval_t pdfdocenc_map_undefined[PDFDOCENC_MUI] = {
     111                 :   { 0x00, 0x08 },
     112                 :   { 0x0B, 0x0C },
     113                 :   { 0x0E, 0x17 },
     114                 :   { 0x7F, 0x7F },
     115                 :   { 0x9F, 0x9F },
     116                 :   { 0xAD, 0xAD }
     117                 : };
     118                 :   */
     119                 : 
     120                 : 
     121                 : /* Mapping of the first byte in a UTF-8 character representation, which
     122                 :  *  determines the number of bytes that will be needed to represent the
     123                 :  *  character:
     124                 :  *     0xxx xxxx -> 1 byte  [00,7F]
     125                 :  *     110x xxxx -> 2 bytes [C0,DF]
     126                 :  *     1110 xxxx -> 3 bytes [E0,EF]
     127                 :  *     1111 0xxx -> 4 bytes [F0,F7]
     128                 :  *  Longer byte sequences are not allowed to represent Unicode points; entries for [80,BF] and [F8,FF] are 0.
     129                 :  */
     130                 : static const unsigned char n_bytes_in_utf8_char [256] = {
     131                 :   1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, /* 00, 1F */
     132                 :   1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, /* 20, 3F */
     133                 :   1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, /* 40, 5F */
     134                 :   1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, /* 60, 7F */
     135                 :   0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* 80, 9F */
     136                 :   0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* A0, BF */
     137                 :   2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, /* C0, DF */
     138                 :   3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,4,4,4,4,4,4,4,4,0,0,0,0,0,0,0,0  /* E0, FF */
     139                 : };
     140                 : 
     141                 : /* UNICODE BOM bytes encoded in the different built-in UNICODE encodings, one row per encoding.
     142                 :  * The trailing number looks like the BOM length in bytes (0 = no BOM) -- TODO confirm against pdf_text_bom_t */
     143                 : static const pdf_text_bom_t unicode_bom [PDF_TEXT_MAX_UNICODE_ENC] = {
     144                 :   { {239,187,191,  0}, 3 },    /* PDF_TEXT_UTF8 */
     145                 :   { {254,255,  0,  0}, 2 },    /* PDF_TEXT_UTF16_BE */
     146                 :   { {255,254,  0,  0}, 2 },    /* PDF_TEXT_UTF16_LE */
     147                 :   { {  0,  0,  0,  0}, 0 },    /* N/A (UTF-16 HE) */
     148                 :   { {  0,  0,254,255}, 4 },    /* PDF_TEXT_UTF32_BE */
     149                 :   { {255,254,  0,  0}, 4 },    /* PDF_TEXT_UTF32_LE */
     150                 :   { {  0,  0,  0,  0}, 0 }     /* N/A (UTF-32HE)  */
     151                 : };
     152                 : 
     153                 : 
     154                 : /******************** PDF Doc Encoding to UTF-32 conversion *******************/
     155                 : /* Return the pdfdocenc_map entry for one PDF Doc Encoding byte (the table holds 0x0000 for undefined bytes) */
     156                 : static pdf_text_utf32_char_t
     157                 : pdf_text_pdfdocenc_point_to_utf32he_point(const pdf_char_t pdfdocenc_val)
     158                 : {
     159                 :   pdf_text_utf32_char_t utf32val;
     160            1130 :   utf32val.i =  pdfdocenc_map[(int)pdfdocenc_val]; /* NOTE(review): assumes pdf_char_t is unsigned -- confirm */
     161            1130 :   return utf32val;
     162                 : }
     163                 : /* Convert from PDF Doc Encoding to UTF-32HE, lossless (the function is not declared static).
     164                 :  * Returns PDF_OK, PDF_ENOMEM or PDF_EBADDATA; the output pointers are only written on PDF_OK. */
     165                 : pdf_status_t
     166                 : pdf_text_pdfdocenc_to_utf32he(const pdf_char_t    *input_data,
     167                 :                               const pdf_size_t    input_length,
     168                 :                               pdf_char_t          **p_output_data,
     169                 :                               pdf_size_t          *p_output_length)
     170             177 : {
     171                 :   /* Note: PDF Doc Encoding has always 8 bits per character.
     172                 :    *  This means that, if length of origin string is N bytes, the number of
     173                 :    *  required bytes for the UTF32 representation of the string is 4N.
     174                 :    *  (Each PDFDocEncoding byte is expanded to 4 bytes in UTF32.) */
     175                 :   
     176                 :   pdf_size_t i;  /* index for the origin string data */
     177                 :   pdf_size_t j;  /* index for the destination string data */
     178                 :   pdf_char_t *data;
     179                 :   pdf_size_t new_length;
     180                 :   
     181                 :   /* Get new string length... NOTE(review): the multiplication is not checked for overflow */
     182             177 :   new_length = 4 * input_length;
     183                 :   /* Create destination string with correct size (but empty!) */
     184             177 :   data = (pdf_char_t *)pdf_alloc(new_length);
     185             177 :   if(data == NULL)
     186                 :     {
     187               0 :       return PDF_ENOMEM;
     188                 :     }
     189                 :   
     190            1295 :   for(i = 0, j = 0; i < input_length; i++, j+=4)
     191                 :     {
     192                 :       pdf_text_utf32_char_t utf32val;
     193                 :       /* Get value... */
     194            2260 :       utf32val = pdf_text_pdfdocenc_point_to_utf32he_point(input_data[i]);
     195            1130 :       if(utf32val.i == 0)
     196                 :         {
     197                 :           /* Oops, the given input byte is UNDEFINED in PDF Doc Encoding; release the partial output */
     198              12 :           pdf_dealloc(data);
     199              12 :           return PDF_EBADDATA;
     200                 :         }
     201                 :       /* Copy the 4 bytes of the converted value to output */
     202            1118 :       memcpy(&(data[j]), &(utf32val), 4);
     203                 :     }
     204                 :   
     205                 :   /* Everything went ok, set output data */
     206             165 :   *p_output_data = data;
     207             165 :   *p_output_length = new_length;
     208                 :   
     209             165 :   return PDF_OK;
     210                 : }
     211                 : 
     212                 : 
     213                 : /******************** UTF-32 to PDF Doc Encoding conversion *******************/
     214                 : /* Map one UTF-32HE code point to a PDF Doc Encoding byte; returns '?' when no mapping is found */
     215                 : static pdf_char_t
     216                 : pdf_text_utf32he_point_to_pdfdocenc_point(const pdf_text_utf32_char_t utf32val)
     217                 : {
     218                 :   
     219                 :   pdf_u8_t i;
     220                 : 
     221                 :   
     222                 :   /* If the given UTF-32 point fits in a single byte (<= 0xFF), only the
     223                 :    *  direct intervals are tried; larger points only try the indirect ones */
     224             805 :   if(utf32val.i <= 0xFF)
     225                 :     {
     226                 :       /* Check if direct conversion is possible. NOTE(review): the zero-filled rows also match code point 0 */
     227            2409 :       for(i=0; i<PDFDOCENC_MDI; ++i)
     228                 :         {
     229            2409 :           if((utf32val.i <= pdfdocenc_map_direct[i].interval_stop) && \
     230                 :              (utf32val.i >= pdfdocenc_map_direct[i].interval_start))
     231                 :             {
     232                 :               /* If the unicode char is among these intervals, direct conversion
     233                 :                * is possible (single byte!) */
     234             803 :               return (pdf_char_t)utf32val.i;
     235                 :             }
     236                 :         }
     237                 :     }
     238                 :   else
     239                 :     {
     240                 :       /* Check if indirect conversion is possible */
     241              22 :       for(i=0; i<PDFDOCENC_MII; ++i)
     242                 :         {
     243                 :           
     244                 :           /* Linear search of pdfdocenc_map over the interval (both ends included) */
     245              20 :           pdf_size_t search_index = pdfdocenc_map_indirect[i].interval_start;
     246             114 :           while((search_index <= pdfdocenc_map_indirect[i].interval_stop) )
     247                 :             {
     248              94 :               if(pdfdocenc_map[search_index] == utf32val.i)
     249                 :                 {
     250                 :                   /* Directly apply search index as character */
     251               0 :                   return (pdf_char_t) search_index;
     252                 :                 }
     253              94 :               search_index++;
     254                 :             }
     255                 :         }
     256                 :     }
     257                 : 
     258                 :   /* If neither Direct conversion nor Indirect conversion are available,
     259                 :    *  the given character is UNDEFINED. Set default character when there is no 
     260                 :    *  direct mapping to PDF Doc Encoding. This means that a conversion
     261                 :    *  to PDF Doc Encoding will NEVER fail just because a code point has no
     262                 :    *  mapping in PDF Doc Encoding */
     263               2 :   return (pdf_char_t) '?';
     264                 : }
     265                 : 
     266                 : 
     267                 : /* Convert from UTF-32HE to PDF Doc Encoding, with loss of information (the function is
     268                 :  not declared static). Returns PDF_EBADDATA, PDF_ENOMEM or PDF_OK. */
     269                 : pdf_status_t
     270                 : pdf_text_utf32he_to_pdfdocenc(const pdf_char_t    *input_data,
     271                 :                               const pdf_size_t    input_length,
     272                 :                               pdf_char_t          **p_output_data,
     273                 :                               pdf_size_t          *p_output_length)
     274             119 : {
     275                 :   /* Note: UTF-32 has always 32 bits per character.
     276                 :    This means that, if length of origin string is 4N bytes, the number of
     277                 :    required bytes for the PDFDocEncoding representation of the string is N.
     278                 :    (Each UNICODE 4-byte character is represented as 1-byte character in
     279                 :    PDFDocEncoding). This means that LOSS of information could happen */
     280                 :   
     281                 :   int i;  /* index for the origin string data */
     282                 :   int j;  /* index for the destination string data */
     283                 :   /* NOTE(review): i is a signed int but is compared against the pdf_size_t input_length in the loop */
     284                 :   /* Check if the length of the origin string is multiple of 4 bytes */
     285             119 :   if(input_length % 4 != 0)
     286                 :     {
     287                 :       PDF_DEBUG_BASE("Input length must be multiple of 4! Invalid UTF-32 data."
     288                 :                      "(Length: %d)", (int)input_length);
     289               0 :       return PDF_EBADDATA;
     290                 :     }
     291                 :   
     292                 :   /* Get new string length... (written to the caller's variable before allocation is attempted) */
     293             119 :   *p_output_length = input_length / 4;
     294                 :   
     295                 :   /* Create destination string with correct size (but empty!) */
     296             119 :   *p_output_data = (pdf_char_t *)pdf_alloc(*p_output_length);
     297             119 :   if(*p_output_data == NULL)
     298                 :     {
     299               0 :       return PDF_ENOMEM;
     300                 :     }
     301                 :   
     302             924 :   for(i = 0, j = 0; i < input_length; i+=4, j++)
     303                 :     {
     304                 :       pdf_text_utf32_char_t   utf32val; /* UNICODE char */
     305                 :       /* Get UTF-32 char to convert (4 bytes copied from the input) */
     306             805 :       memcpy(&utf32val, &input_data[i], 4);
     307                 :       /* Convert character to PDF Doc Encoding */
     308            1610 :       (*p_output_data)[j] = pdf_text_utf32he_point_to_pdfdocenc_point(utf32val);
     309                 :     }
     310                 :   
     311             119 :   return PDF_OK;
     312                 : }
     313                 : 
     314                 : 
     315                 : /*********************** UTF-32 to UTF-32 conversions *************************/
     316                 : /* Function to convert from UTF-32 to UTF-32, lossless: copies the input without its BOM (when
     317                 :  * pdf_text_check_unicode_bom reports one), byte-swapping each value if `swap` is set. */
     318                 : pdf_status_t
     319                 : pdf_text_utf32he_to_utf32he(const pdf_char_t    *input_data,
     320                 :                             const pdf_size_t    input_length,
     321                 :                             const pdf_bool_t    swap,
     322                 :                             const pdf_bool_t    check_input_he,
     323                 :                             const pdf_bool_t    check_output_he,
     324                 :                             pdf_char_t          **p_output_data,
     325                 :                             pdf_size_t          *p_output_length)
     326             744 : {
     327                 :   pdf_size_t walker;
     328             744 :   pdf_size_t bom_bytes = 0;
     329             744 :   pdf_char_t *new_data = NULL;
     330             744 :   pdf_size_t new_size = 0;
     331                 : 
     332             744 :   if(input_length % 4 != 0)
     333                 :     {
     334                 :       /* Invalid number of bytes! */
     335                 :       PDF_DEBUG_BASE("Input length must be multiple of 4! Invalid UTF-32 data."
     336                 :                      " (Length: %d)", (int)input_length);
     337              45 :       return PDF_EBADDATA;
     338                 :     }
     339                 :   
     340                 :   /* Check if BOM is present... and skip it if so */
     341             699 :   if(pdf_text_check_unicode_bom (input_data, input_length,
     342                 :                                  PDF_TEXT_UTF32_HE, swap))
     343                 :     {
     344                 :       /* Skip BOM */
     345              30 :       bom_bytes = 4;
     346                 :     }
     347                 :   
     348                 :   /* Output size is the input size minus the skipped BOM bytes */
     349             699 :   new_size = input_length - bom_bytes;
     350                 :   /* Create destination string with correct size (but empty!) */
     351             699 :   new_data = (pdf_char_t *)pdf_alloc(new_size);
     352             699 :   if(new_data == NULL)
     353                 :     {
     354               0 :       return PDF_ENOMEM;
     355                 :     }
     356                 : 
     357                 :   /* Copy each 32bit value, validating and/or changing its endianness as requested... */
     358            8795 :   for(walker = bom_bytes; walker < input_length; walker+=4)
     359                 :     {
     360                 :       pdf_text_utf32_char_t utf32val;
     361            8096 :       memcpy(&utf32val, &input_data[walker], 4);
     362                 :       
     363                 :       /* Check code point validity (if the input is Host Endian) */
     364                 :       /* Code point must not be a surrogate code unit, and must be in the
     365                 :        * U+00000000 - U+0010FFFF range */
     366            8096 :       if(check_input_he)
     367                 :         {
     368            3368 :           if((utf32val.i > 0x10FFFF) || \
     369                 :               ((utf32val.i >= 0xD800) && \
     370                 :                (utf32val.i <= 0xDFFF)))
     371                 :             {
     372                 :               /* Invalid UTF-32 code point received. NOTE(review): new_data is not released before this return */
     373                 :               PDF_DEBUG_BASE("Invalid input UTF-32HE code point: "
     374                 :                              "%.2X:%.2X:%.2X:%.2X",
     375                 :                              utf32val.c[0],
     376                 :                              utf32val.c[1],
     377                 :                              utf32val.c[2],
     378                 :                              utf32val.c[3]);
     379               0 :               return PDF_EBADTEXT;
     380                 :             }
     381                 :         }
     382                 :       
     383                 :       /* Swap bytes */
     384            8096 :       if(swap)
     385                 :         {
     386            4770 :           utf32val.i = PDF_TEXT_CHANGE_ENDIANNESS_32BIT(utf32val.i);
     387                 :         }
     388                 :       
     389                 :       /* Check code point validity (if the output is Host Endian) */
     390                 :       /* Code point must not be a surrogate code unit, and must be in the
     391                 :        * U+00000000 - U+0010FFFF range */
     392            8096 :       if(check_output_he)
     393                 :         {
     394            4728 :           if((utf32val.i > 0x10FFFF) || \
     395                 :              ((utf32val.i >= 0xD800) && \
     396                 :               (utf32val.i <= 0xDFFF)))
     397                 :             {
     398                 :               /* Invalid UTF-32 code point produced. NOTE(review): new_data is not released before this return */
     399                 :               PDF_DEBUG_BASE("Invalid output UTF-32HE code point: "
     400                 :                              "%.2X:%.2X:%.2X:%.2X",
     401                 :                              utf32val.c[0],
     402                 :                              utf32val.c[1],
     403                 :                              utf32val.c[2],
     404                 :                              utf32val.c[3]);
     405               0 :               return PDF_EBADTEXT;
     406                 :             }
     407                 :         }
     408                 : 
     409                 :       /* Copy value to its position in the output (input offset minus the BOM bytes) */
     410            8096 :       memcpy(&(new_data[walker-bom_bytes]), &utf32val, 4);
     411                 :     }
     412                 :   
     413                 :   /* Really set output data (only reached when every value was accepted) */
     414             699 :   *p_output_data = new_data;
     415             699 :   *p_output_length = new_size;
     416                 :   
     417             699 :   return PDF_OK;
     418                 : }
     419                 : 
     420                 : 
     421                 : /*********************** UTF-16 to UTF-32 conversions *************************/
     422                 : 
     423                 : /* Static function to convert a given UTF-16HE character (with one or two words)
     424                 :  *  to UTF-32HE. The number of bytes (2 or 4) used from the input UTF-16HE point
     425                 :  *  is returned (or 0 if the UTF-16HE point is not valid; *p_utf32val is then not written) */
     426                 : static pdf_size_t
     427                 : pdf_text_utf16he_point_to_utf32he_point(pdf_text_utf16_char_t utf16val[2],
     428                 :                                         pdf_text_utf32_char_t *p_utf32val)
     429                 : {
     430                 :   pdf_size_t n_bytes;
     431                 :   
     432                 :   /* Ok, so how can we know if the UTF16 character is encoded using 2 or
     433                 :    * 4 bytes? A surrogate pair consists of two 16-bit values of the 
     434                 :    * UTF16 encoding. Each word (16bit-value) within the surrogate pair 
     435                 :    * doesn't represent a valid character, as it is enclosed in the 
     436                 :    * following interval: U+D800 - U+DFFF. This means that if the first 
     437                 :    * word analysed is outside this interval, it will be treated 
     438                 :    * separately. If the first word is within this interval, it is 
     439                 :    * expected to have the second word within the interval as well. If 
     440                 :    * this doesn't happen it will be treated as a badly formatted UTF16
     441                 :    * string. In fact, there are two different intervals within the surrogate
     442                 :    * points themselves: the High surrogate point will be in the U+D800 - 
     443                 :    * U+DBFF interval, and the Low surrogate point will be in the U+DC00 - 
     444                 :    * U+DFFF interval. */
     445                 :   
     446            1339 :   if(((utf16val[0].i) >= 0xD800) && \
     447                 :      ((utf16val[0].i) <= 0xDFFF))
     448                 :     {
     449                 :       /* To have a valid surrogate pair, the first UTF-16 value must be the High
     450                 :        *  surrogate code unit, and the second UTF-16 value must be the Low surrogate code unit.
     451                 :        *  NOTE(review): the first test repeats <= 0xDFFF (not <= 0xDBFF), so a Low surrogate passes as first word. */
     452              40 :       if(((utf16val[0].i) <= 0xDFFF) && \
     453                 :          ((utf16val[1].i) >= 0xDC00) && \
     454                 :          ((utf16val[1].i) <= 0xDFFF))
     455                 :         {
     456                 :           /* Yes, second word is within the validity interval, it seems a
     457                 :            * correct 32-bit representation of a character in UTF16HE */
     458              40 :           n_bytes = 4;
     459              40 :           (*p_utf32val).i = 0x10000 + \
     460                 :                             (((utf16val[0].i) - 0xD800) << 10) + \
     461                 :                             ((utf16val[1].i) - 0xDC00);
     462                 :         }
     463                 :       /* else Oops, invalid UTF-16HE surrogate pair! Input data is not well
     464                 :        * formed... */
     465                 :       else
     466                 :         {
     467                 :           PDF_DEBUG_BASE("Invalid UTF-16HE point! %.2X:%.2X:%.2X:%.2X",
     468                 :                          utf16val[0].c[0], utf16val[0].c[1],
     469                 :                          utf16val[1].c[0], utf16val[1].c[1]);
     470               0 :           n_bytes = 0;
     471                 :         }
     472                 :     }
     473                 :   else
     474                 :     {
     475                 :       /* No multiword representation, just 16bits for this character
     476                 :        * So conversion is direct... */
     477            1299 :       n_bytes = 2;
     478            1299 :       (*p_utf32val).i = (utf16val[0]).i;
     479                 :     }
     480            1339 :   return n_bytes;
     481                 : }
     482                 : 
     483                 : 
     484                 : /* Function to convert from UTF16-HE to UTF32-HE, lossless */
     485                 : pdf_status_t
     486                 : pdf_text_utf16he_to_utf32he(const pdf_char_t    *input_data,
     487                 :                             const pdf_size_t    input_length,
     488                 :                             const pdf_bool_t    swap,
     489                 :                             pdf_char_t          **p_output_data,
     490                 :                             pdf_size_t          *p_output_length,
     491                 :                             pdf_char_t          **p_remaining_data,
     492                 :                             pdf_size_t          *p_remaining_length)
     493             148 : {
     494                 :   /* Note: UTF-16 has either 16 or 32 bits per character.
     495                 :    *  This means that, if length of origin string is N bytes, the number of
     496                 :    *  required bytes for the UTF-32 representation of the string is 2N in
     497                 :    *  the worst case (in the case of having all the UTF-16 characters encoded
     498                 :    *  with 16bits).
     499                 :    *  (Each UTF-16 is expanded to 4 bytes in UTF-32. */
     500                 :   
     501                 :   pdf_char_t *data;
     502                 :   pdf_size_t new_string_length;
     503                 :   pdf_size_t new_string_length_worst;
     504                 :   pdf_size_t delta_in_utf16be;
     505                 :   int i;  /* index for the origin string data */
     506                 :   int j;  /* index for the destination string data */
     507                 :   pdf_text_utf16_char_t utf16val[2];
     508                 :   pdf_text_utf32_char_t utf32val;
     509             148 :   short stop_conversion = PDF_FALSE;
     510             148 :   short check_lang_code = PDF_FALSE;
     511             148 :   int bom_bytes = 0;
     512                 :   
     513                 :   /* Check if length is multiple of 2 (data must come in pairs of bytes!) */
     514             148 :   if((input_length < 2) || \
     515                 :      (input_length % 2) != 0)
     516                 :     {
     517                 :       PDF_DEBUG_BASE("Input length must be multiple of 2 and greater than 2!"
     518                 :                      " Invalid UTF-16 data. (Length: %d)", (int)input_length);
     519              45 :       return PDF_EBADDATA;
     520                 :     }
     521                 :   
     522                 :   /* Check if BOM is present... and skip it if so */
     523             103 :   if(pdf_text_check_unicode_bom (input_data, input_length,
     524                 :                                  PDF_TEXT_UTF16_HE, swap))
     525                 :     {
     526                 :       /* Skip BOM */
     527              35 :       bom_bytes = 2;
     528                 :     }
     529                 :   
     530                 :   /* Get new string worst length... (don't consider BOM bytes) */
     531             103 :   new_string_length_worst = 2 * (input_length - bom_bytes);
     532                 :   /* Create destination string with worst size (but empty!) */
     533             103 :   data = (pdf_char_t *)pdf_alloc(new_string_length_worst);
     534             103 :   if(data == NULL)
     535                 :     {
     536               0 :       return PDF_ENOMEM;
     537                 :     }
     538                 :   
     539                 :   /* Initiate final string length */
     540             103 :   new_string_length = 0;
     541                 :   
     542                 :   /* Initiate indexes */
     543             103 :   i = bom_bytes;  /* Skipping BOM if present... */
     544             103 :   j = 0;
     545                 :   
     546                 :   /* Check if specific country/language could be found */
     547             103 :   if((p_remaining_length != NULL) && \
     548                 :      (p_remaining_data != NULL))
     549                 :     {
     550              33 :       check_lang_code = PDF_TRUE;
     551                 :     }
     552                 :   
     553                 :   /* This while loop will be done until the end of the input data OR until
     554                 :    *  the moment a new country/language code identifier is found. But, this
     555                 :    *  extra stop condition will only be available if valid `p_remaining_data'
     556                 :    *  and `p_remaining_length' pointers are given as input. */
     557            1454 :   while((i < input_length) && \
     558                 :         (!stop_conversion))
     559                 :     {
     560            1363 :       if((check_lang_code) && \
     561                 :          (input_data[i+1] == PDF_TEXT_LCI_1) && \
     562                 :          (input_data[i] == PDF_TEXT_LCI_0))
     563                 :         {
     564                 :           /* Stop conversion... due to new lang/code initializer */
     565              12 :           stop_conversion = PDF_TRUE;
     566                 :           /* Set the output remaining data... */
     567              12 :           *p_remaining_length = input_length - i;
     568              12 :           *p_remaining_data = (pdf_char_t *) &input_data[i];
     569                 :         }
     570                 :       else
     571                 :         {
     572                 :           /* Store the UTF-16(BE/LE) data in the intermediate variable */
     573            1339 :           utf16val[0].c[0] = input_data[i];
     574            1339 :           utf16val[0].c[1] = input_data[i+1];
     575            1339 :           if((i+3) < input_length)
     576                 :             {
     577            1286 :               utf16val[1].c[0] = input_data[i+2];
     578            1286 :               utf16val[1].c[1] = input_data[i+3];
     579                 :             }
     580                 :           /* else, last point should be only 1-word length */
     581                 :           else
     582                 :             {
     583              53 :               utf16val[1].c[0] = 0x00;
     584              53 :               utf16val[1].c[1] = 0x00;
     585                 :             }
     586                 :           
     587            1339 :           if(swap)
     588                 :             {
     589                 :               /* Input data must be swapped in order to convert it to
     590                 :                *  host endian */
     591             835 :               utf16val[0].i = PDF_TEXT_CHANGE_ENDIANNESS_16BIT(utf16val[0].i);
     592             835 :               utf16val[1].i = PDF_TEXT_CHANGE_ENDIANNESS_16BIT(utf16val[1].i);
     593                 :             }
     594                 :           
     595                 :           /* Change UTF-16HE point to UTF-32HE point */
     596            1339 :           delta_in_utf16be = pdf_text_utf16he_point_to_utf32he_point(utf16val,
     597                 :                                                                      &utf32val);
     598            1339 :           if(delta_in_utf16be == 0)
     599                 :             {
     600                 :               /* Oops, invalid UTF-16HE point found! */
     601               0 :               pdf_dealloc(data);
     602                 :               PDF_DEBUG_BASE("Conversion from UTF-16 to UTF-32HE stopped");
     603               0 :               return PDF_EBADTEXT;
     604                 :             }
     605                 :           
     606                 :           /* Finally, store the UTF-32 representation of the char in the output
     607                 :            * string... */
     608            1339 :           data[j] = utf32val.c[0];
     609            1339 :           data[j+1] = utf32val.c[1];
     610            1339 :           data[j+2] = utf32val.c[2];
     611            1339 :           data[j+3] = utf32val.c[3];
     612                 :           
     613                 :           /* Update final string length after having added this character */
     614            1339 :           new_string_length+=4;
     615                 :           
     616                 :           /* Update indexes */
     617            1339 :           i += delta_in_utf16be;
     618            1339 :           j += 4;
     619                 :         }
     620                 :     }
     621                 :   
     622                 :   /* Everything went ok, set output data */
     623             103 :   *p_output_data = data;
     624                 :   /* Set output length... */
     625             103 :   *p_output_length = new_string_length;
     626                 :   
     627                 :   /* Check if the stop flag was set due to finding lang/country code
     628                 :    *  initializer. If not found, set zero remaining length and NULL
     629                 :    *  remaining str */
     630             103 :   if((!stop_conversion) && \
     631                 :      (p_remaining_length != NULL) && \
     632                 :      (p_remaining_data != NULL))
     633                 :     {
     634              21 :       *p_remaining_length = 0;
     635              21 :       *p_remaining_data = NULL;
     636                 :     }
     637                 :   
     638                 :   /* Now, if the real output string length is not equal to the worst string
     639                 :    * length, we will reallocate memory for the correct size. This will only
     640                 :    * happen when at least one character is not encoded with 32bits in UTF-16. */
     641             103 :   if(new_string_length != new_string_length_worst)
     642                 :     {
     643                 :       /* Recreate object with correct size... */
     644              50 :       *p_output_data = (pdf_char_t *)pdf_realloc(*p_output_data,
     645                 :                                                  new_string_length);
     646              50 :       if(*p_output_data == NULL)
     647                 :         {
     648               0 :           return PDF_ENOMEM;
     649                 :         }
     650                 :     }  
     651             103 :   return PDF_OK;
     652                 : }
     653                 : 
     654                 : 
     655                 : 
     656                 : /*********************** UTF-32 to UTF-16 conversions *************************/
     657                 : 
     658                 : 
     659                 : /* Static function to convert a given UTF-32HE character to UTF-16HE. The number
     660                 :  * of bytes used in the output UTF-16HE point is returned (or 0
     661                 :  *  if the UTF-16HE point is not valid */
     662                 : static pdf_size_t
     663                 : pdf_text_utf32he_point_to_utf16he_point(pdf_text_utf32_char_t utf32val,
     664                 :                                         pdf_text_utf16_char_t utf16val[2])
     665                 : {
     666                 :   pdf_size_t n_bytes;
     667                 : 
     668            1008 :   if((utf32val.i >= 0xD800) && \
     669                 :      (utf32val.i <= 0xDFFF))
     670                 :     {
     671                 :       PDF_DEBUG_BASE("Invalid UTF-32HE point (surrogate pair found)! "
     672                 :                      "%.2X:%.2X:%.2X:%.2X",
     673                 :                      utf32val.c[0], utf32val.c[1],
     674                 :                      utf32val.c[2], utf32val.c[3]);
     675               0 :       n_bytes = 0;
     676                 :     }
     677                 :   /* Check if multiword (32bits) representation is needed */
     678            1008 :   else if( utf32val.i >= 0x10000 )
     679                 :     {
     680                 :       /* Ok so it seems a multiword representation...
     681                 :        * Now check input UTF-32HE representation to see if it really is a
     682                 :        *  Unicode point (from 0x00000 to 0x10FFFF) */
     683              32 :       if (utf32val.i <= 0x10FFFF)
     684                 :         {
     685                 :           /* 32 bits are required for this char */
     686              32 :           n_bytes = 4;
     687              32 :           utf32val.i -= 0x10000;
     688                 :           /* Process higher 10 bits, by shifting to the right 10 bits */
     689              32 :           (utf16val[0]).i = (utf32val.i >> 10) + 0xD800;
     690                 :           /* Process lower 10 bits, by masking the value with 0x03FF */  
     691              32 :           (utf16val[1]).i = (utf32val.i & 0x03FF) + 0xDC00;
     692                 :         }
     693                 :       else
     694                 :         {
     695                 :           /* else Oops, invalid 32-bit character! Input data is not well
     696                 :            * formed... */
     697                 :           PDF_DEBUG_BASE("Invalid UTF-32HE point! %.2X:%.2X:%.2X:%.2X",
     698                 :                          utf32val.c[0], utf32val.c[1],
     699                 :                          utf32val.c[2], utf32val.c[3]);
     700               0 :           n_bytes = 0;
     701                 :         }
     702                 :     }
     703                 :   else
     704                 :     {
     705             976 :       n_bytes = 2;
     706                 :       /* No multiword representation, just 16bits for this character
     707                 :        * So conversion is direct.... */
     708             976 :       (utf16val[0]).i = utf32val.i;
     709             976 :       (utf16val[1]).i = 0x0000;
     710                 :     }
     711            1008 :   return n_bytes;
     712                 : }
     713                 : 
     714                 : 
     715                 : /*  Function to convert from UTF-32HE to UTF-16, lossless */
     716                 : pdf_status_t
     717                 : pdf_text_utf32he_to_utf16he(const pdf_char_t *input_data,
     718                 :                             const pdf_size_t input_length,
     719                 :                             pdf_char_t       **p_output_data,
     720                 :                             pdf_size_t       *p_output_length,
     721                 :                             pdf_bool_t       swap)
     722              80 : {
     723                 :   /* Note: UTF-16BE has either 16 or 32 bits per character.
     724                 :    This means that, if length of origin string is 4N bytes, the number of
     725                 :    required bytes for the UTF16BE representation of the string is 4N in
     726                 :    the worst case. (When all the UTF16be representations have 32bits)
     727                 :    */
     728                 :   pdf_size_t new_string_length;
     729                 :   pdf_size_t new_string_length_worst;
     730                 :   pdf_size_t delta_in_utf16be;
     731                 :   int i;  /* index for the origin string data */
     732                 :   int j;  /* index for the destination string data */
     733                 :   pdf_text_utf16_char_t utf16val[2];
     734                 :   pdf_text_utf32_char_t utf32val;
     735                 :   pdf_char_t *data;
     736                 :   
     737                 :   /* Get new string length (worst case)... */
     738              80 :   new_string_length_worst = input_length;
     739                 :   /* Create destination string with correct size (but empty!) */
     740              80 :   data = (pdf_char_t *)pdf_alloc(new_string_length_worst);
     741              80 :   if(data == NULL)
     742                 :     {
     743               0 :       return PDF_ENOMEM;
     744                 :     }
     745                 :   
     746                 :   /* Initiate real string length, without considering marker bytes */
     747              80 :   new_string_length = 0;
     748                 :   
     749            1088 :   for( i = 0, j = 0; i < input_length; i += 4, j += delta_in_utf16be )
     750                 :     {
     751                 :       /* Get UCS4 char, as a direct memory copy from the input array */
     752            1008 :       memcpy(&utf32val, &(input_data[i]), 4);
     753                 :       
     754            2016 :       delta_in_utf16be = pdf_text_utf32he_point_to_utf16he_point(utf32val,
     755                 :                                                                  utf16val);
     756                 : 
     757            1008 :       if(delta_in_utf16be == 0)
     758                 :         {
     759                 :           /* Oops, invalid UTF-16HE point found! */
     760               0 :           pdf_dealloc(data);
     761                 :           PDF_DEBUG_BASE("Conversion from UTF-32HE to UTF-16 stopped");
     762               0 :           return PDF_EBADTEXT;
     763                 :         }
     764                 :       
     765                 :       /* Change endianness of each output word if required */
     766            1008 :       if(swap)
     767                 :         {
     768                 :           /* Change to BE */
     769             756 :           (utf16val[0]).i = PDF_TEXT_CHANGE_ENDIANNESS_16BIT((utf16val[0]).i);
     770             756 :           (utf16val[1]).i = PDF_TEXT_CHANGE_ENDIANNESS_16BIT((utf16val[1]).i);
     771                 :         }
     772                 :       
     773                 :       /* Finally, store the UTF16BE representation of the char in the output
     774                 :        * string... */
     775            1008 :       memcpy(&(data[j]), &utf16val[0], delta_in_utf16be);
     776                 :       /* Update new string legth... */
     777            1008 :       new_string_length += delta_in_utf16be;
     778                 :     }
     779                 :   
     780                 :   /* If everything went ok, set output data */
     781              80 :   *p_output_data = data;
     782                 :   /* Set final output length of the generated string */
     783              80 :   *p_output_length = new_string_length;
     784                 :   
     785                 :   /* If the real required string length is not equal to the initial worst length
     786                 :    * then update string with correct length. */
     787              80 :   if(new_string_length != new_string_length_worst)
     788                 :     {
     789                 :       /* Recreate object with correct smaller size... */
     790              80 :       *p_output_data = (pdf_char_t *)pdf_realloc(*p_output_data,
     791                 :                                                  new_string_length);
     792              80 :       if(*p_output_data == NULL)
     793                 :         {
     794               0 :           return PDF_ENOMEM;
     795                 :         }
     796                 :     }
     797                 :   
     798              80 :   return PDF_OK;
     799                 : }
     800                 : 
     801                 : 
     802                 : /************************ UTF-8 to UTF-32 conversions *************************/
     803                 : 
     804                 : /* Static function to convert a given UTF-8 character to UTF-32HE. The number
     805                 :  * of bytes used in the input UTF-8 point is returned (or 0 if the UTF-8 point
     806                 :  * is not valid */
     807                 : static pdf_size_t
     808                 : pdf_text_utf8_point_to_utf32he_point(const pdf_text_utf8_char_t utf8val[4],
     809                 :                                      const pdf_size_t n_bytes,
     810                 :                                      pdf_text_utf32_char_t *p_utf32val)
     811                 : {
     812                 :   int c;  /* index for the utf-8 representation of every char */
     813                 : 
     814                 :   /* Check validity of the UTF-8 bytes:
     815                 :    *  - First byte can be neither 0xFF nor 0xFE
     816                 :    *  - The following bytes must be in the [80-BF] range! (10xxxxxx) */
     817            1982 :   for(c=0; c<n_bytes; c++)
     818                 :     {
     819            1027 :       if(((c == 0) && ((utf8val[0] == 0xFF) || (utf8val[0] == 0xFE))) || \
     820                 :          ((c != 0) && ((utf8val[c]  < 0x80) || (utf8val[c]  > 0xBF))))
     821                 :         {
     822                 :           PDF_DEBUG_BASE("Invalid UTF-8 character: %.2X:%.2X:%.2X:%.2X",
     823                 :                          (int)utf8val[0],
     824                 :                          ((n_bytes>1)?((int)utf8val[1]):0),
     825                 :                          ((n_bytes>2)?((int)utf8val[2]):0),
     826                 :                          ((n_bytes>3)?((int)utf8val[3]):0));
     827              10 :           return 0;
     828                 :         }
     829                 :     }
     830                 :   
     831                 :   /* Load all the bytes of the UTF-8 representation in the UTF-32HE var */
     832             955 :   switch(n_bytes)
     833                 :   {
     834                 :     case 1:
     835             933 :       (*p_utf32val).i = (utf8val[0] & 0x7F);            /* 0111 1111 */
     836                 :       break;
     837                 :     case 2:
     838              10 :       (*p_utf32val).i = ((utf8val[0] & 0x1F) << 6) +    /* 0001 1111 */
     839                 :                         (utf8val[1] & 0x3F);          /* 0011 1111 */
     840                 :       break;
     841                 :     case 3:
     842               4 :       (*p_utf32val).i = ((utf8val[0] & 0x0F) << 12) +   /* 0000 1111 */
     843                 :                         ((utf8val[1] & 0x3F) << 6) +  /* 0011 1111 */
     844                 :                         (utf8val[2] & 0x3F);          /* 0011 1111 */
     845                 :       break;
     846                 :     case 4:
     847               8 :       (*p_utf32val).i = ((utf8val[0] & 0x07) << 18) +   /* 0000 1111 */
     848                 :                         ((utf8val[1] & 0x3F) << 12) +  /* 0000 1111 */
     849                 :                         ((utf8val[2] & 0x3F) << 6) +   /* 0011 1111 */
     850                 :                         (utf8val[3] & 0x3F);           /* 0011 1111 */
     851                 :       break;
     852                 :     default:
     853                 :       /* Should never happen! */
     854               0 :       return 0;
     855                 :   }
     856                 :   
     857             955 :   return n_bytes;
     858                 : }
     859                 : 
     860                 : /* Function to convert from UTF-8 to UTF-32HE, lossless */
     861                 : pdf_status_t
     862                 : pdf_text_utf8_to_utf32he(const pdf_char_t    *input_data,
     863                 :                          const pdf_size_t    input_length,
     864                 :                          pdf_char_t          **p_output_data,
     865                 :                          pdf_size_t          *p_output_length)
     866             129 : {
     867                 :   /* Note: PDF Doc Encoding has always 8 bits per character.
     868                 :    *  This means that, if length of origin string is N bytes, the number of
     869                 :    *  required bytes for the UTF32 representation of the string is 4N.
     870                 :    *  (Each PDFDocEncoding byte is expanded to 4 bytes in UTF32. */
     871                 :   pdf_size_t new_string_length;
     872                 :   pdf_size_t new_string_length_worst;
     873                 :   pdf_size_t bom_bytes;
     874                 :   int i;  /* index for the origin string data */
     875                 :   int j;  /* index for the destination string data */
     876                 :   pdf_size_t delta_in_utf8;
     877                 : 
     878                 :   pdf_char_t *data;
     879                 :   
     880                 :   /* Check if BOM is present... and skip it if so */
     881             129 :   bom_bytes = 0;
     882             129 :   if(pdf_text_check_unicode_bom (input_data, input_length, PDF_TEXT_UTF8, 0))
     883                 :     {
     884                 :       /* Skip BOM in UTF-8 */
     885              10 :       bom_bytes = 3;
     886                 :     }
     887                 :   
     888                 :   /* Get new string length... */
     889             129 :   new_string_length_worst = 4 * (input_length - bom_bytes);
     890                 :   
     891                 :   /* Create destination string with worst size (but empty!) */
     892             129 :   data = (pdf_char_t *)pdf_alloc(new_string_length_worst);
     893             129 :   if(data == NULL)
     894                 :     {
     895               0 :       return PDF_ENOMEM;
     896                 :     }
     897                 :   
     898             129 :   new_string_length = 0;
     899            1084 :   for(i = bom_bytes, j = 0; i < input_length; i+=delta_in_utf8, j+=4)
     900                 :     {
     901                 :       pdf_text_utf32_char_t utf32val;
     902                 :       pdf_text_utf8_char_t utf8val[4];
     903                 : 
     904                 :       /* Check number of bytes needed for the UTF-8 char */
     905             985 :       delta_in_utf8 = n_bytes_in_utf8_char[(int)input_data[i]];
     906                 :       
     907                 :       /* Check validity of first byte in UTF-8 */
     908                 :       /* Check if the required bytes are outside the input data stream */
     909             985 :       if((delta_in_utf8 == 0) || \
     910                 :          ((input_length - i) < delta_in_utf8))
     911                 :         {
     912                 :           PDF_DEBUG_BASE("Wrong UTF-8 data received (UTF-8 length: %d, "
     913                 :                          "Remaining length: %d", delta_in_utf8,
     914                 :                          (input_length - i));
     915              20 :           pdf_dealloc(data);
     916              20 :           return PDF_EBADDATA;
     917                 :         }
     918                 :       
     919                 :       /* Store data in intermediate UTF-8 variable */
     920             965 :       memcpy(&utf8val[0], &input_data[i], delta_in_utf8);
     921                 :       
     922             965 :       if(pdf_text_utf8_point_to_utf32he_point(utf8val,
     923                 :                                               delta_in_utf8,
     924                 :                                               &utf32val) == 0)
     925                 :         {
     926                 :           PDF_DEBUG_BASE("Problem decoding UTF-8 string");
     927              10 :           pdf_dealloc(data);
     928              10 :           return PDF_EBADDATA;
     929                 :         }
     930                 : 
     931                 :       /* Copy converted value (in UTF-32HE) to output */
     932             955 :       memcpy(&(data[j]), &(utf32val), 4);
     933                 :       
     934                 :       /* Update new string length */
     935             955 :       new_string_length += 4;
     936                 :     }
     937                 :   
     938                 :   /* If everything went ok, set output data */
     939              99 :   *p_output_data = data;
     940                 :   /* Set final output length of the generated string */
     941              99 :   *p_output_length = new_string_length;
     942                 :   
     943                 :   /* If the real required string length is not equal to the initial worst length
     944                 :    * then update string with correct length. */
     945              99 :   if(new_string_length != new_string_length_worst)
     946                 :     {
     947                 :       /* Recreate object with correct smaller size... */
     948              14 :       *p_output_data = (pdf_char_t *)pdf_realloc(*p_output_data,
     949                 :                                                  new_string_length);
     950              14 :       if(*p_output_data == NULL)
     951                 :         {
     952               0 :           return PDF_ENOMEM;
     953                 :         }
     954                 :     }
     955                 :   
     956              99 :   return PDF_OK;
     957                 : }
     958                 : 
     959                 : 
     960                 : /************************ UTF-32 to UTF-8 conversions *************************/
     961                 : 
     962                 : /* Static function to convert a given UTF-32HE character to UTF-8. The number
     963                 :  * of bytes used in the output UTF-8 point is returned (or 0 if the UTF-8 point
     964                 :  * is not valid */
     965                 : static pdf_size_t
     966                 : pdf_text_utf32he_point_to_utf8_point(const pdf_text_utf32_char_t utf32val,
     967                 :                                      pdf_text_utf8_char_t utf8val[4])
     968                 : {
     969                 :   pdf_size_t n_bytes;
     970                 : 
     971             381 :   if(utf32val.i < 0x80)
     972                 :     {
     973                 :       /* Output is 1 byte */
     974             365 :       n_bytes = 1;
     975             365 :       utf8val[0] = (pdf_text_utf8_char_t) utf32val.i;
     976                 :     }
     977              16 :   else if(utf32val.i < 0x800)
     978                 :     {
     979                 :       /* Output is 2 bytes */
     980               4 :       n_bytes = 2;
     981                 :       /* Get first byte, using upper 5 bits --> 110xxxxx */
     982               4 :       utf8val[0] = ((pdf_text_utf8_char_t) (utf32val.i >> 6)) | 0xC0;
     983                 :       /* Get second byte, using lower 6 bits --> 10xxxxxx */
     984               4 :       utf8val[1] = ((pdf_text_utf8_char_t) (utf32val.i & 0x3F)) | 0x80;
     985                 :     }
     986              12 :   else if(utf32val.i < 0x10000)
     987                 :     {
     988                 :       /* Output is 3 bytes */
     989               4 :       n_bytes = 3;
     990                 :       /* Get first byte, using upper 4 bits --> 1110xxxx */
     991               4 :       utf8val[0] = ((pdf_text_utf8_char_t)(utf32val.i >> 12)) | 0xE0;
     992                 :       /* Get second byte, using middle 6 bits --> 10xxxxxx */
     993               4 :       utf8val[1] = ((pdf_text_utf8_char_t)((utf32val.i >> 6) & 0x3F)) | 0x80;
     994                 :       /* Get third byte, using lower 6 bits --> 10xxxxxx */
     995               4 :       utf8val[2] = ((pdf_text_utf8_char_t)(utf32val.i & 0x3F)) | 0x80;
     996                 :     }
     997               8 :   else if(utf32val.i < 0x0010FFFF)
     998                 :     {
     999                 :       /* Output is 4 bytes */
    1000               8 :       n_bytes = 4;
    1001                 :       /* Get first byte, using upper 3 bits --> 11110xxx */
    1002               8 :       utf8val[0] = ((pdf_text_utf8_char_t)(utf32val.i >> 18)) | 0xF0;
    1003                 :       /* Get second byte, using upper-middle 6 bits --> 10xxxxxx */
    1004               8 :       utf8val[1] = (((pdf_text_utf8_char_t)(utf32val.i >> 12)) & 0x3F) | 0x80;
    1005                 :       /* Get third byte, using lower-middle 6 bits --> 10xxxxxx */
    1006               8 :       utf8val[2] = (((pdf_text_utf8_char_t)(utf32val.i >> 6)) & 0x3F) | 0x80;
    1007                 :       /* Get fourth byte, using lower 6 bits --> 10xxxxxx */
    1008               8 :       utf8val[3] = ((pdf_text_utf8_char_t)(utf32val.i & 0x3F)) | 0x80;
    1009                 :     }
    1010                 :   else
    1011                 :     {
    1012                 :       /* Invalid input UTF-32 val */
    1013                 :       PDF_DEBUG_BASE("Wrong UTF-32BE value! '0x%.2X 0x%.2X 0x%.2X 0x%.2X'",
    1014                 :                      utf32val.c[0],utf32val.c[1],utf32val.c[2],utf32val.c[3]);
    1015               0 :       n_bytes = 0;
    1016                 :     }
    1017             381 :   return n_bytes;
    1018                 : }
    1019                 : 
    1020                 : /* Function to convert from UTF-32HE to UTF-8, lossless */
    1021                 : pdf_status_t
    1022                 : pdf_text_utf32he_to_utf8(const pdf_char_t      *input_data,
    1023                 :                          const pdf_size_t      input_length,
    1024                 :                          pdf_char_t            **p_output_data,
    1025                 :                          pdf_size_t            *p_output_length)
    1026              23 : {
    1027                 :   /* Note: UTF-8 has either 8, 16, 24 or 32 bits per character.
    1028                 :    This means that, if length of origin string is 4N bytes, the number of
    1029                 :    required bytes for the UTF-8 representation of the string is 4N in
    1030                 :    the worst case. (When all the UTF-8 representations have 32bits)
    1031                 :    */
    1032                 :   pdf_size_t new_string_length;
    1033                 :   pdf_size_t new_string_length_worst;
    1034                 :   int i;  /* index for the origin string data */
    1035                 :   int j;  /* index for the destination string data */
    1036                 :   pdf_char_t *data;
    1037                 :   pdf_size_t delta_in_utf8;
    1038                 :   
    1039                 :   /* Get new string length (worst case)... */
    1040              23 :   new_string_length_worst = input_length;
    1041                 :   /* Create destination string with correct size (but empty!) */
    1042              23 :   data = (pdf_char_t *)pdf_alloc(new_string_length_worst);
    1043              23 :   if(data == NULL)
    1044                 :     {
    1045               0 :       return PDF_ENOMEM;
    1046                 :     }
    1047                 :   
    1048                 :   /* Initiate real string length, without considering marker bytes */
    1049              23 :   new_string_length = 0;
    1050                 :   
    1051             404 :   for( i = 0, j = 0; i < input_length; i += 4, j += delta_in_utf8 )
    1052                 :     {
    1053                 :       pdf_text_utf32_char_t utf32val;
    1054                 :       pdf_text_utf8_char_t utf8val[4];
    1055                 : 
    1056                 :       /* Get UTF-32 char, as a direct memory copy from the input array */
    1057             381 :       memcpy(&utf32val, &(input_data[i]), 4);
    1058                 :       
    1059             762 :       delta_in_utf8 = pdf_text_utf32he_point_to_utf8_point(utf32val,utf8val);
    1060             381 :       if(delta_in_utf8 == 0)
    1061                 :         {
    1062                 :           PDF_DEBUG_BASE("Problem encoding UTF-8 string");
    1063               0 :           pdf_dealloc(data);
    1064               0 :           return PDF_EBADTEXT;
    1065                 :         }
    1066                 :       
    1067                 :       /* Store UTF-8 val in output array */
    1068             381 :       memcpy(&data[j], &(utf8val[0]), delta_in_utf8);
    1069                 :       
    1070                 :       /* Update new_string_length, depending on the bytes used to represent
    1071                 :        *  this character in UTF-8 */
    1072             381 :       new_string_length += delta_in_utf8;
    1073                 :     }
    1074                 :   
    1075                 :   /* If everything went ok, set output data */
    1076              23 :   *p_output_data = data;
    1077                 :   /* Set final output length of the generated string */
    1078              23 :   *p_output_length = new_string_length;
    1079                 :   
    1080                 :   /* If the real required string length is not equal to the initial worst length
    1081                 :    * then update string with correct length. */
    1082              23 :   if(new_string_length != new_string_length_worst)
    1083                 :     {
    1084                 :       /* Recreate object with correct smaller size... */
    1085              19 :       *p_output_data = (pdf_char_t *)pdf_realloc(*p_output_data,
    1086                 :                                                  new_string_length);
    1087              19 :       if(*p_output_data == NULL)
    1088                 :         {
    1089               0 :           return PDF_ENOMEM;
    1090                 :         }
    1091                 :     }
    1092                 :   
    1093              23 :   return PDF_OK;
    1094                 : }
    1095                 : 
    1096                 : 
    1097                 : /*************************** BOM-related functions ****************************/
    1098                 : 
    1099                 : 
    1100                 : inline pdf_text_bom_t
    1101                 : pdf_text_get_unicode_bom(enum pdf_text_unicode_encoding_e unicode_encoding)
    1102             108 : {
    1103             108 :   return unicode_bom[unicode_encoding];
    1104                 : }
    1105                 : 
    1106                 : 
    1107                 : pdf_bool_t
    1108                 : pdf_text_check_unicode_bom (const pdf_char_t *data,
    1109                 :                             const pdf_size_t size,
    1110                 :                             enum pdf_text_unicode_encoding_e enc,
    1111                 :                             int swap)
    1112            1023 : {
    1113            1023 :   switch(enc)
    1114                 :   {
    1115                 :     case PDF_TEXT_UTF16_HE:
    1116                 :     case PDF_TEXT_UTF32_HE:
    1117                 :     {
    1118             802 :       enc += ((PDF_IS_BIG_ENDIAN ^ swap) ? PDF_TEXT_HE_TO_BE:PDF_TEXT_HE_TO_LE);
    1119                 :     }
    1120                 :     case PDF_TEXT_UTF8:
    1121                 :     case PDF_TEXT_UTF16_BE:
    1122                 :     case PDF_TEXT_UTF16_LE:
    1123                 :     case PDF_TEXT_UTF32_BE:
    1124                 :     case PDF_TEXT_UTF32_LE:
    1125                 :     {
    1126            1023 :       if((size >= unicode_bom[enc].bom_bytes) && \
    1127                 :          (memcmp(data,unicode_bom[enc].bom_data,unicode_bom[enc].bom_bytes)==0))
    1128                 :         {
    1129              96 :           return PDF_TRUE;
    1130                 :         }
    1131                 :     }
    1132                 :     default:
    1133             927 :       return PDF_FALSE;
    1134                 :   }
    1135                 : }
    1136                 : 
    1137                 : /* End of pdf-text-encoding.c */

Generated by: LTP GCOV extension version 1.6