1 : /* -*- mode: C -*-
2 : *
3 : * File: pdf-text-filter.c
4 : * Date: Fri Feb 25 23:58:56 2008
5 : *
6 : * GNU PDF Library - Encoded Text Filters
7 : *
8 : */
9 :
10 : /* Copyright (C) 2008 Free Software Foundation, Inc. */
11 :
12 : /* This program is free software: you can redistribute it and/or modify
13 : * it under the terms of the GNU General Public License as published by
14 : * the Free Software Foundation, either version 3 of the License, or
15 : * (at your option) any later version.
16 : *
17 : * This program is distributed in the hope that it will be useful,
18 : * but WITHOUT ANY WARRANTY; without even the implied warranty of
19 : * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20 : * GNU General Public License for more details.
21 : *
22 : * You should have received a copy of the GNU General Public License
23 : * along with this program. If not, see <http://www.gnu.org/licenses/>.
24 : */
25 :
26 : #include <config.h>
27 :
28 : #include <string.h>
29 : #include <stdio.h>
30 :
31 : #include <pdf-text.h>
32 : #include <pdf-text-filter.h>
33 : #include <pdf-text-context.h>
34 : #include <pdf-text-ucd.h>
35 :
36 :
37 :
38 : /* Change Case of text */
39 : static pdf_status_t
40 : pdf_text_filter_change_case(pdf_text_t text,
41 : enum unicode_case_type new_case)
42 54 : {
43 : pdf_size_t i;
44 : pdf_size_t n_words;
45 : pdf_size_t worst_length;
46 : pdf_size_t new_length;
47 : pdf_char_t *new_data;
48 : pdf_list_t new_wb_list;
49 :
50 : const pdf_char_t *language;
51 :
52 : /* Generate original word boundaries list, if not already done */
53 54 : if(pdf_text_generate_word_boundaries(text) != PDF_OK)
54 : {
55 : PDF_DEBUG_BASE("Couldn't create list of word boundaries");
56 0 : return PDF_ETEXTENC;
57 : }
58 :
59 : /* Get text language ID. First, try to get it from the pdf_text_t element */
60 54 : language = pdf_text_get_language(text);
61 : /* If text element doesn't have a language ID, get it from the text context */
62 54 : if(strlen((char *)language) == 0)
63 : {
64 18 : language = pdf_text_context_get_host_language();
65 : }
66 :
67 : /* Worst length will be having 3 output UTF-32 characters per each input
68 : * UTF-32 character. First of all, allocate memory for the worst length */
69 54 : worst_length = text->size * UCD_SC_MAX_EXPAND;
70 54 : new_data = (pdf_char_t *)pdf_alloc(worst_length);
71 54 : if(new_data == NULL)
72 : {
73 0 : return PDF_ENOMEM;
74 : }
75 :
76 : /* Create new empty word boundaries list */
77 54 : if(pdf_text_create_word_boundaries_list(&new_wb_list) != PDF_OK)
78 : {
79 : PDF_DEBUG_BASE("Unable to create empty list");
80 0 : pdf_dealloc(new_data);
81 0 : return PDF_ETEXTENC;
82 : }
83 :
84 : /* Walk list of words, uppercasing all of them */
85 108 : n_words = pdf_list_size(text->word_boundaries);
86 54 : new_length = 0;
87 210 : for(i = 0; i < n_words; ++i)
88 : {
89 : struct pdf_text_wb_s *p_new_word;
90 : struct pdf_text_wb_s *p_word;
91 156 : pdf_size_t new_word_length = 0;
92 : pdf_status_t ret_code;
93 :
94 : /* Allocate new word */
95 156 : p_new_word = (struct pdf_text_wb_s *)pdf_alloc(sizeof(struct pdf_text_wb_s));
96 156 : if(p_new_word == NULL)
97 : {
98 0 : return PDF_ENOMEM;
99 : }
100 :
101 : /* Get word to process from list of words */
102 312 : if(pdf_list_get_at(text->word_boundaries, \
103 : i, \
104 : (const void **)&p_word) != PDF_OK)
105 : {
106 0 : pdf_dealloc(new_data);
107 0 : pdf_list_destroy(new_wb_list);
108 0 : return PDF_ETEXTENC;
109 : }
110 :
111 : /* Apply the case algorithm to the full word */
112 156 : if((ret_code = pdf_text_ucd_word_change_case(&new_data[new_length],
113 : &new_word_length,
114 : new_case,
115 : p_word->word_start,
116 : p_word->word_size,
117 : language)) != PDF_OK)
118 : {
119 : PDF_DEBUG_BASE("Problem x-casing full word");
120 0 : pdf_list_destroy(new_wb_list);
121 0 : pdf_dealloc(new_data);
122 0 : pdf_dealloc(p_new_word);
123 0 : return ret_code;
124 : }
125 :
126 : /* Create new word info */
127 156 : p_new_word->word_start = &new_data[new_length];
128 156 : p_new_word->word_size = new_word_length;
129 156 : p_new_word->word_stop = &new_data[new_length + new_word_length -4];
130 :
131 : /* Add word to new list */
132 156 : pdf_list_add_last(new_wb_list, p_new_word, NULL);
133 :
134 : /* Update new length */
135 156 : new_length += new_word_length;
136 : }
137 :
138 :
139 : /* Finally, reset the buffer length to its correct size */
140 54 : if(new_length != worst_length)
141 : {
142 54 : new_data = (pdf_char_t *)pdf_realloc(new_data,new_length);
143 54 : if(new_data == NULL)
144 : {
145 0 : pdf_text_destroy_word_boundaries_list(&new_wb_list);
146 0 : return PDF_ENOMEM;
147 : }
148 : }
149 :
150 : /* Replace contents (data and word boundary list) */
151 54 : pdf_dealloc(text->data);
152 54 : text->data = new_data;
153 54 : text->size = new_length;
154 54 : pdf_text_destroy_word_boundaries_list(&(text->word_boundaries));
155 54 : text->word_boundaries = new_wb_list;
156 :
157 54 : return PDF_OK;
158 : }
159 :
160 : /* Make all text Upper Case */
161 : pdf_status_t
162 : pdf_text_filter_upper_case(pdf_text_t text)
163 18 : {
164 18 : return pdf_text_filter_change_case(text, UNICODE_CASE_INFO_UPPER_CASE);
165 : }
166 :
167 : /* Make all text Lower Case */
168 : pdf_status_t
169 : pdf_text_filter_lower_case(pdf_text_t text)
170 18 : {
171 18 : return pdf_text_filter_change_case(text, UNICODE_CASE_INFO_LOWER_CASE);
172 : }
173 :
174 :
175 : /* Make all text Title Case */
176 : pdf_status_t
177 : pdf_text_filter_title_case(pdf_text_t text)
178 18 : {
179 18 : return pdf_text_filter_change_case(text, UNICODE_CASE_INFO_TITLE_CASE);
180 : }
181 :
182 :
183 : /* Remove all single ampersands, and turn ' && ' into ' & ' */
184 : pdf_status_t
185 : pdf_text_filter_remove_amp(pdf_text_t text)
186 2 : {
187 : pdf_status_t ret_code;
188 2 : ret_code = pdf_text_replace_ascii(text,(pdf_char_t *)" ",(pdf_char_t *)" & ");
189 2 : if(ret_code != PDF_OK)
190 : {
191 0 : return ret_code;
192 : }
193 2 : return pdf_text_replace_ascii(text,(pdf_char_t *)" & ",(pdf_char_t *)" && ");
194 : }
195 :
196 :
197 : /* Convert all ASCII code points to their Full-Width variants. These Full-Width
198 : * variants are located in the FF00-FF60 range as follows:
199 : * - The range U+FF01-U+FF5E contains the full width variants of the ASCII
200 : * characters in the range U+0021-U+007E.
201 : * - The range U+FF5F-U+FF60 contains the full width variants of double
202 : * parentheses in the range U+2985-U+2986
203 : */
204 : pdf_status_t
205 : pdf_text_filter_normalize_full_width_ascii(pdf_text_t text)
206 3 : {
207 : pdf_size_t i;
208 3 : const pdf_u32_t delta = 0xFF01 - 0x0021;
209 3 : const pdf_u32_t delta2 = 0xFF5F - 0x2985;
210 :
211 11 : for(i=0; i<text->size; i+=4)
212 : {
213 : pdf_u32_t unicode_point;
214 : /* Get unicode point in UTF-32HE */
215 8 : memcpy(&unicode_point, &(text->data[i]), 4);
216 :
217 : /* Check ranges */
218 8 : if((unicode_point <= 0x007E) && \
219 : (unicode_point >= 0x0021))
220 : {
221 3 : unicode_point += delta;
222 3 : memcpy(&(text->data[i]), &unicode_point, 4);
223 : }
224 5 : else if((unicode_point >= 0x2985) && \
225 : (unicode_point <= 0x2986))
226 : {
227 0 : unicode_point += delta2;
228 0 : memcpy(&(text->data[i]), &unicode_point, 4);
229 : }
230 : }
231 3 : return PDF_OK;
232 : }
233 :
234 :
235 : /* Substitute line endings with a given UTF-8 pattern. */
236 : static pdf_status_t
237 : pdf_text_substitute_line_ending(pdf_text_t text, const pdf_text_eol_t new_eol)
238 4 : {
239 4 : pdf_status_t ret_code = PDF_OK;
240 : int i;
241 : pdf_text_t new_text_pattern;
242 : pdf_text_t *eols;
243 :
244 : /* Allocate memory for pdf_text_t old eols */
245 4 : eols = (pdf_text_t *)pdf_alloc(PDF_TEXT_EOLMAX * sizeof(pdf_text_t));
246 4 : if(eols == NULL)
247 : {
248 0 : return PDF_ENOMEM;
249 : }
250 :
251 : /* Create text new pattern */
252 4 : if(pdf_text_new_from_unicode(new_eol->sequence,
253 : strlen((char *)new_eol->sequence),
254 : PDF_TEXT_UTF8,
255 : &new_text_pattern) != PDF_OK)
256 : {
257 0 : pdf_dealloc(eols);
258 : PDF_DEBUG_BASE("New EOL is not UTF-8");
259 0 : return PDF_EBADTEXT;
260 : }
261 :
262 : /* For each possible EOL type, create a pdf_text_t */
263 20 : for(i = PDF_TEXT_EOL_WINDOWS; i < PDF_TEXT_EOLMAX; ++i)
264 : {
265 : pdf_text_eol_t requested_eol;
266 :
267 : /* Get Host EOL */
268 16 : requested_eol = pdf_text_context_get_host_eol((enum pdf_text_eol_types)i);
269 :
270 :
271 : /* Create text old pattern */
272 16 : if(pdf_text_new_from_unicode(requested_eol->sequence,
273 : strlen((char *)requested_eol->sequence),
274 : PDF_TEXT_UTF8,
275 : &eols[i]) != PDF_OK)
276 : {
277 0 : pdf_text_destroy(new_text_pattern);
278 0 : pdf_dealloc(eols);
279 : PDF_DEBUG_BASE("Old EOL is not UTF-8");
280 0 : return PDF_EBADTEXT;
281 : }
282 : }
283 :
284 : /* Perform the replacement */
285 4 : ret_code = pdf_text_replace_multiple(text,
286 : new_text_pattern,
287 : eols,
288 : PDF_TEXT_EOLMAX);
289 :
290 : /* Destroy used patterns */
291 20 : for(i = PDF_TEXT_EOL_WINDOWS; i < PDF_TEXT_EOLMAX; i++)
292 : {
293 16 : pdf_text_destroy(eols[i]);
294 : }
295 4 : pdf_dealloc(eols);
296 4 : pdf_text_destroy(new_text_pattern);
297 :
298 4 : return ret_code;
299 : }
300 :
301 :
302 :
303 : /* Normalize all EOL sequences to the default host EOL */
304 : pdf_status_t
305 : pdf_text_filter_normalize_line_endings(pdf_text_t text)
306 2 : {
307 : pdf_text_eol_t host_eol;
308 : /* Get this host EOL */
309 2 : host_eol = pdf_text_context_get_host_eol(PDF_TEXT_EOL_HOST);
310 : /* Finally, substitute line endings */
311 2 : return pdf_text_substitute_line_ending(text, host_eol);
312 : }
313 :
314 :
315 : /* Replace EOL sequences with white spaces */
316 : pdf_status_t
317 : pdf_text_filter_remove_line_endings(pdf_text_t text)
318 2 : {
319 2 : const struct pdf_text_eol_s empty_eol = { { 0x00, 0x00, 0x00 } };
320 : /* Substitute line endings */
321 2 : return pdf_text_substitute_line_ending(text, (pdf_text_eol_t)(&empty_eol));
322 : }
323 :
324 :
325 : /* End of pdf-text-filter.c */
|