LTP GCOV extension - code coverage report
Current view: directory - src/base - pdf-text-filter.c
Test: libgnupdf.info
Date: 2010-07-31 Instrumented lines: 94
Code covered: 75.5 % Executed lines: 71

       1                 : /* -*- mode: C -*-
       2                 :  *
       3                 :  *       File:         pdf-text-filter.c
       4                 :  *       Date:         Fri Feb 25 23:58:56 2008
       5                 :  *
       6                 :  *       GNU PDF Library - Encoded Text Filters
       7                 :  *
       8                 :  */
       9                 : 
      10                 : /* Copyright (C) 2008 Free Software Foundation, Inc. */
      11                 : 
      12                 : /* This program is free software: you can redistribute it and/or modify
      13                 :  * it under the terms of the GNU General Public License as published by
      14                 :  * the Free Software Foundation, either version 3 of the License, or
      15                 :  * (at your option) any later version.
      16                 :  *
      17                 :  * This program is distributed in the hope that it will be useful,
      18                 :  * but WITHOUT ANY WARRANTY; without even the implied warranty of
      19                 :  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
      20                 :  * GNU General Public License for more details.
      21                 :  *
      22                 :  * You should have received a copy of the GNU General Public License
      23                 :  * along with this program.  If not, see <http://www.gnu.org/licenses/>.
      24                 :  */
      25                 : 
      26                 : #include <config.h>
      27                 : 
      28                 : #include <string.h>
      29                 : #include <stdio.h>
      30                 : 
      31                 : #include <pdf-text.h>
      32                 : #include <pdf-text-filter.h>
      33                 : #include <pdf-text-context.h>
      34                 : #include <pdf-text-ucd.h>
      35                 : 
      36                 : 
      37                 : 
      38                 : /* Change Case of text */
      39                 : static pdf_status_t
      40                 : pdf_text_filter_change_case(pdf_text_t text,
      41                 :                             enum unicode_case_type new_case)
      42              54 : {
      43                 :   pdf_size_t i;
      44                 :   pdf_size_t n_words;
      45                 :   pdf_size_t worst_length;
      46                 :   pdf_size_t new_length;
      47                 :   pdf_char_t *new_data;
      48                 :   pdf_list_t new_wb_list;
      49                 :   
      50                 :   const pdf_char_t *language;
      51                 : 
      52                 :   /* Generate original word boundaries list, if not already done */
      53              54 :   if(pdf_text_generate_word_boundaries(text) != PDF_OK)
      54                 :     {
      55                 :       PDF_DEBUG_BASE("Couldn't create list of word boundaries");
      56               0 :       return PDF_ETEXTENC;
      57                 :     }
      58                 :   
      59                 :   /* Get text language ID. First, try to get it from the pdf_text_t element */
      60              54 :   language = pdf_text_get_language(text);
      61                 :   /* If text element doesn't have a language ID, get it from the text context */
      62              54 :   if(strlen((char *)language) == 0)
      63                 :     {
      64              18 :       language = pdf_text_context_get_host_language();
      65                 :     }
      66                 :   
      67                 :   /* In the worst case each input UTF-32 character expands to 3 output UTF-32
      68                 :    *  characters (the code scales by UCD_SC_MAX_EXPAND). Allocate that first */
      69              54 :   worst_length = text->size * UCD_SC_MAX_EXPAND;
      70              54 :   new_data = (pdf_char_t *)pdf_alloc(worst_length);
      71              54 :   if(new_data == NULL)
      72                 :     {
      73               0 :       return PDF_ENOMEM;
      74                 :     }
      75                 :   
      76                 :   /* Create new empty word boundaries list */
      77              54 :   if(pdf_text_create_word_boundaries_list(&new_wb_list) != PDF_OK)
      78                 :     {
      79                 :       PDF_DEBUG_BASE("Unable to create empty list");
      80               0 :       pdf_dealloc(new_data);
      81               0 :       return PDF_ETEXTENC;
      82                 :     }
      83                 : 
      84                 :   /* Walk list of words, applying the requested case change to each of them */
      85             108 :   n_words = pdf_list_size(text->word_boundaries);
      86              54 :   new_length = 0;
      87             210 :   for(i = 0; i < n_words; ++i)
      88                 :     {
      89                 :       struct pdf_text_wb_s *p_new_word;
      90                 :       struct pdf_text_wb_s *p_word;
      91             156 :       pdf_size_t new_word_length = 0;
      92                 :       pdf_status_t ret_code;
      93                 : 
      94                 :       /* Allocate new word */
      95             156 :       p_new_word = (struct pdf_text_wb_s *)pdf_alloc(sizeof(struct pdf_text_wb_s));
      96             156 :       if(p_new_word == NULL)
      97                 :         {
      98               0 :           return PDF_ENOMEM;
      99                 :         }
     100                 : 
     101                 :       /* Get word to process from list of words */
     102             312 :       if(pdf_list_get_at(text->word_boundaries, \
     103                 :                          i, \
     104                 :                          (const void **)&p_word) != PDF_OK)
     105                 :         {
     106               0 :           pdf_dealloc(new_data);
     107               0 :           pdf_list_destroy(new_wb_list);
     108               0 :           return PDF_ETEXTENC;
     109                 :         }
     110                 : 
     111                 :       /* Apply the case algorithm to the full word */
     112             156 :       if((ret_code = pdf_text_ucd_word_change_case(&new_data[new_length],
     113                 :                                                    &new_word_length,
     114                 :                                                    new_case,
     115                 :                                                    p_word->word_start,
     116                 :                                                    p_word->word_size,
     117                 :                                                    language)) != PDF_OK)
     118                 :         {
     119                 :           PDF_DEBUG_BASE("Problem x-casing full word");
     120               0 :           pdf_list_destroy(new_wb_list);
     121               0 :           pdf_dealloc(new_data);
     122               0 :           pdf_dealloc(p_new_word);
     123               0 :           return ret_code;
     124                 :         }
     125                 : 
     126                 :       /* Create new word info */
     127             156 :       p_new_word->word_start = &new_data[new_length];
     128             156 :       p_new_word->word_size = new_word_length;
     129             156 :       p_new_word->word_stop = &new_data[new_length + new_word_length -4];
     130                 : 
     131                 :       /* Add word to new list */
     132             156 :       pdf_list_add_last(new_wb_list, p_new_word, NULL);
     133                 : 
     134                 :       /* Update new length */
     135             156 :       new_length += new_word_length;
     136                 :     }
     137                 : 
     138                 :   
     139                 :   /* Finally, reset the buffer length to its correct size */
     140              54 :   if(new_length != worst_length)
     141                 :     {
     142              54 :       new_data = (pdf_char_t *)pdf_realloc(new_data,new_length);
     143              54 :       if(new_data == NULL)
     144                 :         {
     145               0 :           pdf_text_destroy_word_boundaries_list(&new_wb_list);
     146               0 :           return PDF_ENOMEM;
     147                 :         }
     148                 :     }
     149                 :   
     150                 :   /* Replace contents (data and word boundary list) */
     151              54 :   pdf_dealloc(text->data);
     152              54 :   text->data = new_data;
     153              54 :   text->size = new_length;
     154              54 :   pdf_text_destroy_word_boundaries_list(&(text->word_boundaries));
     155              54 :   text->word_boundaries = new_wb_list;
     156                 :   
     157              54 :   return PDF_OK;
     158                 : }
     159                 : 
     160                 : /* Make all text Upper Case */
     161                 : pdf_status_t
     162                 : pdf_text_filter_upper_case(pdf_text_t text)
     163              18 : {
     164              18 :   return pdf_text_filter_change_case(text, UNICODE_CASE_INFO_UPPER_CASE);
     165                 : }
     166                 : 
     167                 : /* Make all text Lower Case */
     168                 : pdf_status_t
     169                 : pdf_text_filter_lower_case(pdf_text_t text)
     170              18 : {
     171              18 :   return pdf_text_filter_change_case(text, UNICODE_CASE_INFO_LOWER_CASE);
     172                 : }
     173                 : 
     174                 : 
     175                 : /* Make all text Title Case */
     176                 : pdf_status_t
     177                 : pdf_text_filter_title_case(pdf_text_t text)
     178              18 : {
     179              18 :   return pdf_text_filter_change_case(text, UNICODE_CASE_INFO_TITLE_CASE);
     180                 : }
     181                 : 
     182                 : 
     183                 : /* Remove all single ampersands, and turn ' && ' into ' & ' */
     184                 : pdf_status_t
     185                 : pdf_text_filter_remove_amp(pdf_text_t text)
     186               2 : {
     187                 :   pdf_status_t ret_code;
     188               2 :   ret_code = pdf_text_replace_ascii(text,(pdf_char_t *)" ",(pdf_char_t *)" & ");
     189               2 :   if(ret_code != PDF_OK)
     190                 :     {
     191               0 :       return ret_code;
     192                 :     }
     193               2 :   return pdf_text_replace_ascii(text,(pdf_char_t *)" & ",(pdf_char_t *)" && ");
     194                 : }
     195                 : 
     196                 : 
     197                 : /* Convert all ASCII code points to their Full-Width variants. These Full-Width
     198                 :  *   variants are located in the FF00-FF60 range as follows:
     199                 :  * - The range U+FF01-U+FF5E contains the full width variants of the ASCII
     200                 :  *   characters in the range U+0021-U+007E.
     201                 :  * - The range U+FF5F-U+FF60 contains the full width variants of double
     202                 :  *   parentheses in the range U+2985-U+2986
     203                 :  */
     204                 : pdf_status_t
     205                 : pdf_text_filter_normalize_full_width_ascii(pdf_text_t text)
     206               3 : {
     207                 :   pdf_size_t i;
     208               3 :   const pdf_u32_t delta = 0xFF01 - 0x0021;
     209               3 :   const pdf_u32_t delta2 = 0xFF5F - 0x2985;
     210                 : 
     211              11 :   for(i=0; i<text->size; i+=4)
     212                 :     {
     213                 :       pdf_u32_t unicode_point;
     214                 :       /* Get unicode point in UTF-32HE */
     215               8 :       memcpy(&unicode_point, &(text->data[i]), 4);
     216                 : 
     217                 :       /* Check ranges */
     218               8 :       if((unicode_point <= 0x007E) && \
     219                 :          (unicode_point >= 0x0021))
     220                 :         {
     221               3 :           unicode_point += delta;
     222               3 :           memcpy(&(text->data[i]), &unicode_point, 4);
     223                 :         }
     224               5 :       else if((unicode_point >= 0x2985) && \
     225                 :               (unicode_point <= 0x2986))
     226                 :         {
     227               0 :           unicode_point += delta2;
     228               0 :           memcpy(&(text->data[i]), &unicode_point, 4);
     229                 :         }
     230                 :     }
     231               3 :   return PDF_OK;
     232                 : }
     233                 : 
     234                 : 
     235                 : /* Substitute line endings with a given UTF-8 pattern. */
     236                 : static pdf_status_t
     237                 : pdf_text_substitute_line_ending(pdf_text_t text, const pdf_text_eol_t new_eol)
     238               4 : {
     239               4 :   pdf_status_t ret_code = PDF_OK;
     240                 :   int i;
     241                 :   pdf_text_t new_text_pattern;
     242                 :   pdf_text_t *eols;
     243                 : 
     244                 :   /* Allocate memory for pdf_text_t old eols */
     245               4 :   eols = (pdf_text_t *)pdf_alloc(PDF_TEXT_EOLMAX * sizeof(pdf_text_t));
     246               4 :   if(eols == NULL)
     247                 :     {
     248               0 :       return PDF_ENOMEM;
     249                 :     }
     250                 : 
     251                 :   /* Create text new pattern */
     252               4 :   if(pdf_text_new_from_unicode(new_eol->sequence,
     253                 :                                strlen((char *)new_eol->sequence),
     254                 :                                PDF_TEXT_UTF8,
     255                 :                                &new_text_pattern) != PDF_OK)
     256                 :     {
     257               0 :       pdf_dealloc(eols);
     258                 :       PDF_DEBUG_BASE("New EOL is not UTF-8");
     259               0 :       return PDF_EBADTEXT;
     260                 :     }
     261                 : 
     262                 :   /* For each possible EOL type, create a pdf_text_t */
     263              20 :   for(i = PDF_TEXT_EOL_WINDOWS; i < PDF_TEXT_EOLMAX; ++i)
     264                 :     {
     265                 :       pdf_text_eol_t requested_eol;
     266                 : 
     267                 :       /* Get the EOL sequence for EOL type `i` from the text context */
     268              16 :       requested_eol = pdf_text_context_get_host_eol((enum pdf_text_eol_types)i);
     269                 : 
     270                 : 
     271                 :       /* Create text old pattern */
     272              16 :       if(pdf_text_new_from_unicode(requested_eol->sequence,
     273                 :                                    strlen((char *)requested_eol->sequence),
     274                 :                                    PDF_TEXT_UTF8,
     275                 :                                    &eols[i]) != PDF_OK)
     276                 :         {
     277               0 :           pdf_text_destroy(new_text_pattern);
     278               0 :           pdf_dealloc(eols);
     279                 :           PDF_DEBUG_BASE("Old EOL is not UTF-8");
     280               0 :           return PDF_EBADTEXT;
     281                 :         }
     282                 :     }
     283                 : 
     284                 :   /* Perform the replacement */
     285               4 :   ret_code = pdf_text_replace_multiple(text,
     286                 :                                        new_text_pattern,
     287                 :                                        eols,
     288                 :                                        PDF_TEXT_EOLMAX);
     289                 : 
     290                 :   /* Destroy used patterns */
     291              20 :   for(i = PDF_TEXT_EOL_WINDOWS; i < PDF_TEXT_EOLMAX; i++)
     292                 :     {
     293              16 :       pdf_text_destroy(eols[i]);
     294                 :     }
     295               4 :   pdf_dealloc(eols);
     296               4 :   pdf_text_destroy(new_text_pattern);
     297                 : 
     298               4 :   return ret_code;
     299                 : }
     300                 : 
     301                 : 
     302                 : 
     303                 : /* Normalize all EOL sequences to the default host EOL */
     304                 : pdf_status_t
     305                 : pdf_text_filter_normalize_line_endings(pdf_text_t text)
     306               2 : {
     307                 :   pdf_text_eol_t host_eol;
     308                 :   /* Get this host EOL */
     309               2 :   host_eol = pdf_text_context_get_host_eol(PDF_TEXT_EOL_HOST);
     310                 :   /* Finally, substitute line endings */
     311               2 :   return pdf_text_substitute_line_ending(text, host_eol);
     312                 : }
     313                 : 
     314                 : 
     315                 : /* Remove EOL sequences by replacing them with an empty sequence */
     316                 : pdf_status_t
     317                 : pdf_text_filter_remove_line_endings(pdf_text_t text)
     318               2 : {
     319               2 :   const struct pdf_text_eol_s empty_eol =  { { 0x00, 0x00, 0x00 } };
     320                 :   /* Substitute line endings */
     321               2 :   return pdf_text_substitute_line_ending(text, (pdf_text_eol_t)(&empty_eol));
     322                 : }
     323                 : 
     324                 : 
     325                 : /* End of pdf-text-filter.c */

Generated by: LTP GCOV extension version 1.6