1 : /* -*- mode: C -*-
2 : *
3 : * File: pdf-token-reader.c
4 : * Date: Mon Dec 29 00:45:09 2008
5 : *
6 : * GNU PDF Library - Stream tokeniser
7 : *
8 : */
9 :
10 : /* Copyright (C) 2008, 2009 Free Software Foundation, Inc. */
11 :
12 : /* This program is free software: you can redistribute it and/or modify
13 : * it under the terms of the GNU General Public License as published by
14 : * the Free Software Foundation, either version 3 of the License, or
15 : * (at your option) any later version.
16 : *
17 : * This program is distributed in the hope that it will be useful,
18 : * but WITHOUT ANY WARRANTY; without even the implied warranty of
19 : * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20 : * GNU General Public License for more details.
21 : *
22 : * You should have received a copy of the GNU General Public License
23 : * along with this program. If not, see <http://www.gnu.org/licenses/>.
24 : */
25 :
26 : #include <config.h>
27 :
28 : #include <assert.h>
29 : #include <stdlib.h>
30 : #include <string.h>
31 : #include <locale.h>
32 :
33 : #include <pdf-token-reader.h>
34 :
35 : static INLINE pdf_status_t store_char (pdf_token_reader_t reader,
36 : pdf_char_t ch);
37 : static INLINE pdf_status_t store_char_grow (pdf_token_reader_t reader,
38 : pdf_char_t ch);
39 : static pdf_status_t exit_state (pdf_token_reader_t reader, pdf_u32_t flags,
40 : pdf_token_t *token);
41 : static INLINE pdf_status_t enter_state (pdf_token_reader_t reader,
42 : enum pdf_token_reader_state_e state);
43 : static pdf_status_t flush_token (pdf_token_reader_t reader, pdf_u32_t flags,
44 : pdf_token_t *token);
45 : static pdf_status_t handle_char (pdf_token_reader_t reader, pdf_u32_t flags,
46 : pdf_char_t ch, pdf_token_t *token);
47 : static INLINE pdf_status_t handle_string_char (pdf_token_reader_t reader,
48 : pdf_u32_t flags,
49 : pdf_char_t ch,
50 : pdf_token_t *token);
51 : static INLINE pdf_status_t handle_hexstring_char (pdf_token_reader_t reader,
52 : pdf_u32_t flags,
53 : pdf_char_t ch,
54 : pdf_token_t *token);
55 : static int recognise_number (pdf_buffer_t buffer, int *int_value);
56 : static INLINE int parse_integer (pdf_buffer_t buffer, int *int_value,
57 : int *int_state);
58 : static INLINE pdf_status_t parse_real (pdf_buffer_t buffer,
59 : char *locale_dec_pt,
60 : double *value);
61 : static INLINE int validate_real (pdf_buffer_t buffer, int int_state);
62 :
63 :
64 : pdf_status_t
65 : pdf_token_reader_new (pdf_stm_t stm, pdf_token_reader_t *reader)
66 701 : {
67 : pdf_status_t err;
68 : pdf_token_reader_t new_tokr;
69 :
70 701 : err = PDF_ENOMEM;
71 701 : new_tokr = pdf_alloc (sizeof (*new_tokr));
72 701 : if (!new_tokr)
73 0 : goto fail;
74 :
75 701 : new_tokr->beg_pos = 0;
76 701 : new_tokr->state_pos = 0;
77 :
78 : /* determine the current locale's decimal point
79 : * (avoid using localeconv since it may not be thread-safe) */
80 701 : new_tokr->decimal_point = NULL;
81 : {
82 : int len;
83 : char decpt[16];
84 :
85 701 : err = PDF_ERROR;
86 701 : len = snprintf (decpt, sizeof (decpt), "%#.0f", 1.0);
87 701 : if (len <= 0 || (pdf_size_t)len >= sizeof (decpt)) /* shouldn't happen */
88 0 : goto fail;
89 :
90 701 : err = PDF_ENOMEM;
91 701 : new_tokr->decimal_point = pdf_alloc (len);
92 701 : if (!new_tokr->decimal_point)
93 0 : goto fail;
94 :
95 : /* this copies the trailing '\0' due to the starting offset */
96 701 : memcpy (new_tokr->decimal_point, &decpt[1], len);
97 : }
98 :
99 : /* buffer_size_min is the default buffer size, which is also the maximum
100 : * size for keywords, names, numbers, etc.; strings and comments will
101 : * enlarge the buffer to whatever size is needed. */
102 701 : new_tokr->buffer_size_min = 32768;
103 701 : new_tokr->buffer = pdf_buffer_new (new_tokr->buffer_size_min);
104 701 : if (!new_tokr->buffer)
105 0 : goto fail;
106 :
107 701 : new_tokr->stream = stm;
108 701 : pdf_token_reader_reset (new_tokr);
109 :
110 701 : *reader = new_tokr;
111 701 : return PDF_OK;
112 :
113 0 : fail:
114 0 : if (new_tokr)
115 : {
116 0 : if (new_tokr->decimal_point)
117 0 : pdf_dealloc (new_tokr->decimal_point);
118 0 : pdf_dealloc (new_tokr);
119 : }
120 :
121 0 : return err;
122 : }
123 :
124 : static void
125 : reset_buffer (pdf_token_reader_t reader)
126 : {
127 2328 : reader->buffer->wp = 0;
128 2328 : if (reader->buffer->size > reader->buffer_size_min)
129 : {
130 : /* Try to shrink the buffer, but don't worry if it fails. */
131 1 : pdf_buffer_resize (reader->buffer, reader->buffer_size_min);
132 : }
133 : }
134 :
135 : pdf_status_t
136 : pdf_token_reader_reset (pdf_token_reader_t reader)
137 701 : {
138 : enter_state (reader, PDF_TOKR_STATE_NONE);
139 701 : reader->substate = 0;
140 : reset_buffer (reader);
141 701 : return PDF_OK;
142 : }
143 :
144 : pdf_status_t
145 : pdf_token_reader_destroy (pdf_token_reader_t reader)
146 516 : {
147 516 : if (!reader) return PDF_EBADDATA;
148 :
149 516 : assert (reader->buffer);
150 516 : if (reader->buffer)
151 516 : pdf_buffer_destroy (reader->buffer);
152 516 : pdf_dealloc (reader->decimal_point);
153 516 : pdf_dealloc (reader);
154 :
155 516 : return PDF_OK;
156 : }
157 :
158 : static INLINE pdf_char_t
159 : hexval (pdf_char_t ch)
160 : {
161 544 : if (ch >= 48 && ch <= 48+9) /* '0'--'9' */
162 347 : return ch - 48;
163 197 : if (ch >= 64+1 && ch <= 64+6) /* 'A'--'F' */
164 1 : return ch - (64+1) + 10;
165 196 : if (ch >= 96+1 && ch <= 96+6) /* 'a'--'f' */
166 196 : return ch - (96+1) + 10;
167 0 : return 255;
168 : }
169 :
170 :
171 : /* Tries to handle the given character and possibly produce a token.
172 : * Sets (*token) if a token is produced, and leaves it unmodified otherwise.
173 : *
174 : * Returns PDF_OK if the character was accepted. Otherwise, an error code
175 : * is returned, and the call can be repeated later with the same ch value.
176 : * A token may be produced even if the character isn't accepted.
177 : */
178 : static pdf_status_t
179 : handle_char (pdf_token_reader_t reader, pdf_u32_t flags,
180 : pdf_char_t ch, pdf_token_t *token)
181 49254 : {
182 : pdf_status_t rv;
183 :
184 : /* first, handle the states that shouldn't be exited when whitespace
185 : * or a delimiter is seen */
186 :
187 49254 : switch (reader->state)
188 : {
189 : case PDF_TOKR_STATE_EOF:
190 1 : return PDF_EEOF;
191 :
192 : case PDF_TOKR_STATE_STRING:
193 84256 : return handle_string_char (reader, flags, ch, token);
194 :
195 : case PDF_TOKR_STATE_HEXSTRING:
196 92 : return handle_hexstring_char (reader, flags, ch, token);
197 :
198 : case PDF_TOKR_STATE_DICTEND:
199 1 : if (ch != 62) /* '>' */
200 0 : return PDF_EBADFILE;
201 1 : reader->substate = 1; /* saw the closing '>' */
202 1 : return exit_state (reader, flags, token);
203 :
204 : case PDF_TOKR_STATE_COMMENT:
205 30 : if (pdf_is_eol_char (ch))
206 : {
207 3 : rv = exit_state (reader, flags, token);
208 3 : if (rv != PDF_OK)
209 0 : return rv;
210 :
211 : /* don't accept this character, but process it next time */
212 3 : return PDF_EAGAIN;
213 : }
214 :
215 12 : if (!(flags & PDF_TOKEN_RET_COMMENTS))
216 12 : reader->substate = 1;
217 12 : if (reader->substate == 1)
218 12 : return PDF_OK; /* we don't care about this comment */
219 :
220 0 : return store_char_grow (reader, ch);
221 :
222 : default: ;
223 : }
224 :
225 : /* now handle delimiters and whitespace */
226 :
227 14126 : if (pdf_is_wspace_char (ch))
228 : {
229 2571 : if (reader->state)
230 : {
231 1403 : rv = exit_state (reader, flags, token);
232 1403 : if (rv != PDF_OK)
233 0 : return rv;
234 :
235 : /* avoid reading this byte so PDF_TOKEN_END_AT_STREAM
236 : * will work properly if it's '\r' */
237 1403 : return PDF_EAGAIN;
238 : }
239 :
240 1168 : if ((flags & PDF_TOKEN_END_AT_STREAM) && ch == 10) /* LF */
241 : {
242 : /* found the beginning of a stream */
243 : enter_state (reader, PDF_TOKR_STATE_EOF);
244 : }
245 1168 : return PDF_OK;
246 : }
247 4492 : else if ((flags & PDF_TOKEN_END_AT_STREAM) && ch != 37) /* 37=='%' */
248 : {
249 : /* only allow whitespace/comments after the "stream" keyword */
250 0 : return PDF_EBADFILE;
251 : }
252 :
253 8984 : if (pdf_is_delim_char (ch))
254 : {
255 : /* set state 0 (UNINIT), substate 0, bufpos 0 */
256 987 : if (reader->state)
257 : {
258 29 : rv = exit_state (reader, flags, token);
259 29 : if (rv != PDF_OK)
260 0 : return rv;
261 29 : return PDF_EAGAIN;
262 : }
263 :
264 958 : switch (ch)
265 : {
266 : case 37: /* '%' */
267 : enter_state (reader, PDF_TOKR_STATE_COMMENT);
268 3 : return PDF_OK;
269 : case 40: /* '(' */
270 : enter_state (reader, PDF_TOKR_STATE_STRING);
271 7 : reader->intparam = 0;
272 7 : return PDF_OK;
273 : case 41: /* ')' */
274 : /* this shouldn't occur outside the STRING and COMMENT states */
275 0 : return PDF_EBADFILE;
276 : case 47: /* '/' */
277 : enter_state (reader, PDF_TOKR_STATE_NAME);
278 512 : return PDF_OK;
279 : case 60: /* '<' */
280 : enter_state (reader, PDF_TOKR_STATE_HEXSTRING);
281 4 : return PDF_OK;
282 : case 62: /* '>' */
283 : enter_state (reader, PDF_TOKR_STATE_DICTEND);
284 1 : return PDF_OK;
285 : case 91: /* '[' */
286 : /* fall through */
287 : case 93: /* ']' */
288 : /* fall through */
289 : case 123: /* '{' */
290 : /* fall through */
291 : case 125: /* '}' */
292 : /* exit_state may have emitted a token, so we can't emit another
293 : * one now; we'll do it when exiting the PENDING state */
294 : enter_state (reader, PDF_TOKR_STATE_PENDING);
295 431 : reader->charparam = ch;
296 431 : return PDF_OK;
297 : }
298 :
299 : /* not reached (all delimiter chars should be handled) */
300 0 : assert (0);
301 : }
302 :
303 : /* ch is a regular character */
304 :
305 3505 : switch (reader->state)
306 : {
307 : case PDF_TOKR_STATE_PENDING:
308 1 : rv = exit_state (reader, flags, token);
309 1 : if (rv != PDF_OK)
310 0 : return rv;
311 1 : return PDF_EAGAIN;
312 :
313 : case PDF_TOKR_STATE_NONE:
314 : enter_state (reader, PDF_TOKR_STATE_KEYWORD);
315 : /* fall through */
316 :
317 : case PDF_TOKR_STATE_KEYWORD:
318 : /* Note: numbers are treated as keywords until flush_token is called. */
319 4944 : return store_char (reader, ch);
320 :
321 : case PDF_TOKR_STATE_NAME:
322 1032 : if (reader->substate == 0)
323 : {
324 518 : if ((ch < 0x21) || (ch > 0x7e))
325 : {
326 : /* Invalid character in a name. */
327 156 : return PDF_EBADFILE;
328 : }
329 :
330 362 : if (ch != 35 /* '#' */
331 : || (flags & PDF_TOKEN_NO_NAME_ESCAPES) )
332 210 : return store_char (reader, ch);
333 :
334 257 : reader->substate = 1;
335 257 : return PDF_OK;
336 : }
337 :
338 1028 : if ( (ch = hexval (ch)) >= 16 )
339 0 : return PDF_EBADFILE;
340 :
341 514 : if (reader->substate == 1) /* the first hex digit of an escape */
342 : {
343 257 : reader->substate = 2;
344 257 : reader->charparam = ch;
345 257 : return PDF_OK;
346 : }
347 :
348 257 : ch = (reader->charparam << 4) | ch;
349 257 : if (ch == 0) /* the PDF spec forbids "#00" */
350 0 : return PDF_EBADFILE;
351 :
352 514 : rv = store_char (reader, ch);
353 257 : if (rv == PDF_OK) reader->substate = 0;
354 257 : return rv;
355 :
356 : default:
357 0 : assert (0);
358 : return PDF_ERROR;
359 : }
360 : }
361 :
362 :
363 : static INLINE int
364 : can_store_char (const pdf_token_reader_t reader)
365 : {
366 44943 : return reader->buffer->wp < reader->buffer->size;
367 : }
368 :
369 : static pdf_status_t
370 : enlarge_buffer (pdf_token_reader_t reader)
371 : {
372 1 : pdf_size_t size = reader->buffer->size, newsize = size * 2;
373 1 : if (newsize < size)
374 0 : return PDF_EIMPLLIMIT;
375 :
376 1 : return pdf_buffer_resize (reader->buffer, newsize);
377 : }
378 :
379 : static INLINE pdf_status_t
380 : store_char (pdf_token_reader_t reader, pdf_char_t ch)
381 : {
382 2834 : if (!can_store_char (reader))
383 0 : return PDF_EIMPLLIMIT;
384 2834 : reader->buffer->data[reader->buffer->wp++] = ch;
385 2834 : return PDF_OK;
386 : }
387 :
388 : static INLINE pdf_status_t
389 : store_char_grow (pdf_token_reader_t reader, pdf_char_t ch)
390 42109 : {
391 42109 : if (!can_store_char (reader))
392 : {
393 1 : pdf_status_t rv = enlarge_buffer(reader);
394 1 : if (rv != PDF_OK)
395 0 : return rv;
396 : }
397 42109 : reader->buffer->data[reader->buffer->wp++] = ch;
398 42109 : return PDF_OK;
399 : }
400 :
401 : static INLINE pdf_status_t
402 : enter_state (pdf_token_reader_t reader,
403 : enum pdf_token_reader_state_e state)
404 : {
405 2485 : reader->state = state;
406 2485 : reader->state_pos = pdf_stm_tell (reader->stream);
407 :
408 2485 : return PDF_OK;
409 : }
410 :
411 : static pdf_status_t
412 : flush_token (pdf_token_reader_t reader, pdf_u32_t flags, pdf_token_t *token)
413 1806 : {
414 : pdf_status_t rv;
415 : pdf_token_t new_tok;
416 1806 : pdf_char_t *data = reader->buffer->data;
417 1806 : int datasize = reader->buffer->wp;
418 :
419 1806 : switch (reader->state)
420 : {
421 : case PDF_TOKR_STATE_NONE:
422 3 : return PDF_OK; /* no state to exit */
423 :
424 : case PDF_TOKR_STATE_EOF:
425 176 : return PDF_EEOF; /* can't continue parsing after EOF */
426 :
427 : case PDF_TOKR_STATE_COMMENT:
428 3 : if ((reader->substate == 1) || !(flags & PDF_TOKEN_RET_COMMENTS))
429 : goto finish; /* don't return a token */
430 :
431 0 : rv = pdf_token_comment_new (data, datasize, &new_tok);
432 0 : break;
433 :
434 : case PDF_TOKR_STATE_KEYWORD:
435 : {
436 : int value;
437 1650 : int ntyp = recognise_number (reader->buffer, &value);
438 825 : if (ntyp == 1)
439 173 : rv = pdf_token_integer_new (value, &new_tok);
440 652 : else if (ntyp == 2)
441 : {
442 : double realvalue;
443 26 : rv = parse_real (reader->buffer,
444 : reader->decimal_point,
445 : &realvalue);
446 13 : if (rv != PDF_OK)
447 0 : return rv;
448 13 : rv = pdf_token_real_new ((float)realvalue, &new_tok);
449 : }
450 : else
451 639 : rv = pdf_token_keyword_new (data, datasize, &new_tok);
452 : }
453 : break;
454 :
455 : case PDF_TOKR_STATE_NAME:
456 356 : if (reader->substate != 0) /* reading an escape sequence */
457 0 : return PDF_EBADFILE;
458 :
459 356 : rv = pdf_token_name_new (data, datasize, &new_tok);
460 356 : break;
461 :
462 : case PDF_TOKR_STATE_STRING:
463 7 : if (reader->intparam >= 0) /* didn't see the closing ')' */
464 0 : return PDF_EBADFILE;
465 :
466 7 : rv = pdf_token_string_new (data, datasize, &new_tok);
467 7 : break;
468 :
469 : case PDF_TOKR_STATE_HEXSTRING:
470 3 : if (reader->substate != 3) /* didn't see the closing '>' */
471 0 : return PDF_EBADFILE;
472 :
473 3 : rv = pdf_token_string_new (data, datasize, &new_tok);
474 3 : break;
475 :
476 : case PDF_TOKR_STATE_DICTEND:
477 1 : if (reader->substate != 1) /* didn't see a second '>' */
478 0 : return PDF_EBADFILE;
479 :
480 1 : rv = pdf_token_valueless_new (PDF_TOKEN_DICT_END, &new_tok);
481 1 : break;
482 :
483 : case PDF_TOKR_STATE_PENDING:
484 432 : switch (reader->charparam)
485 : {
486 : case 60: /* '<' */
487 1 : rv = pdf_token_valueless_new (PDF_TOKEN_DICT_START, &new_tok);
488 1 : break;
489 : case 91: /* '[' */
490 1 : rv = pdf_token_valueless_new (PDF_TOKEN_ARRAY_START, &new_tok);
491 1 : break;
492 : case 93: /* ']' */
493 1 : rv = pdf_token_valueless_new (PDF_TOKEN_ARRAY_END, &new_tok);
494 1 : break;
495 : case 123: /* '{' */
496 215 : rv = pdf_token_valueless_new (PDF_TOKEN_PROC_START, &new_tok);
497 215 : break;
498 : case 125: /* '}' */
499 214 : rv = pdf_token_valueless_new (PDF_TOKEN_PROC_END, &new_tok);
500 214 : break;
501 : default:
502 0 : assert (0);
503 : return PDF_ERROR;
504 : }
505 : break;
506 :
507 : default:
508 0 : assert (0);
509 : return PDF_ERROR;
510 : }
511 :
512 1624 : if (rv != PDF_OK)
513 0 : return rv;
514 :
515 1624 : *token = new_tok;
516 :
517 : /* Set the beginning position of this state */
518 1624 : reader->beg_pos = reader->state_pos;
519 :
520 1627 : finish:
521 : reset_buffer (reader);
522 1627 : return PDF_OK;
523 : }
524 :
525 :
526 : static pdf_status_t
527 : exit_state (pdf_token_reader_t reader, pdf_u32_t flags, pdf_token_t *token)
528 1806 : {
529 1806 : pdf_status_t rv = flush_token (reader, flags, token);
530 1806 : if (rv == PDF_OK)
531 : {
532 1630 : reader->state = PDF_TOKR_STATE_NONE;
533 1630 : reader->substate = 0;
534 : }
535 1806 : return rv;
536 : }
537 :
538 :
539 : static INLINE pdf_status_t
540 : handle_string_char (pdf_token_reader_t reader,
541 : pdf_u32_t flags,
542 : pdf_char_t ch,
543 : pdf_token_t *token)
544 : {
545 : pdf_status_t rv;
546 42129 : start:
547 42129 : switch (reader->substate)
548 : {
549 : case 1: /* ignore LF */
550 6 : reader->substate = 0;
551 6 : if (ch == 10)
552 3 : return PDF_OK;
553 : /* fall through */
554 :
555 : case 0: /* no special state */
556 : {
557 42104 : if (ch == 92) /* '\\' */
558 : {
559 18 : reader->substate = 2; /* start an escape sequence */
560 18 : return PDF_OK;
561 : }
562 42086 : else if (ch == 41 && reader->intparam <= 0) /* ')'; end of string */
563 : {
564 7 : reader->intparam = -1;
565 7 : return exit_state (reader, flags, token);
566 : }
567 :
568 42079 : pdf_bool_t wasCR = (ch == 13);
569 42079 : if (wasCR)
570 3 : ch = 10; /* treat as LF */
571 42079 : rv = store_char_grow (reader, ch);
572 :
573 42079 : if (rv == PDF_OK)
574 : {
575 42079 : if (wasCR) /* '\r' */
576 3 : reader->substate = 1; /* ignore the next char if it's LF */
577 42076 : else if (ch == 40) /* '(' */
578 2 : ++reader->intparam;
579 42074 : else if (ch == 41) /* ')' */
580 2 : --reader->intparam;
581 : }
582 :
583 42079 : return rv;
584 : }
585 :
586 : case 2: /* just saw a '\\' (starting an escape sequence) */
587 18 : reader->substate = 0;
588 18 : if (ch == 98) /* 'b' */
589 1 : ch = 8; /* BS: backspace */
590 17 : else if (ch == 102) /* 'f' */
591 1 : ch = 12; /* FF: formfeed */
592 16 : else if (ch == 110) /* 'n' */
593 1 : ch = 10; /* NL: newline */
594 15 : else if (ch == 114) /* 'r' */
595 1 : ch = 13; /* CR: carriage return */
596 14 : else if (ch == 116) /* 't' */
597 1 : ch = 9; /* HT: horizontal tab */
598 13 : else if (ch == 10) /* NL */
599 1 : return PDF_OK; /* ignore the line break */
600 12 : else if (ch == 13) /* CR */
601 : {
602 : /* ignore the line break; also ignore the next byte if it's LF */
603 3 : reader->substate = 1;
604 3 : return PDF_OK;
605 : }
606 9 : else if (ch >= 48 && ch <= 48+7) /* digits '0'--'7' */
607 : {
608 : /* starting an octal escape; we'll read three digits even if the
609 : * first is '4'--'7' (and calculate the final char modulo 256),
610 : * since the PDF/PS specs say to ignore high-order overflow */
611 2 : reader->substate = 3;
612 2 : reader->charparam = (ch-48);
613 2 : return PDF_OK;
614 : }
615 :
616 : /* for any other character, including '(', ')', and '\\',
617 : * store the same character (dropping the leading backslash) */
618 12 : return store_char_grow (reader, ch);
619 :
620 : case 3: /* saw 1 digit of an octal escape */
621 : /* fall through */
622 : case 4: /* saw 2 digits of an octal escape */
623 4 : if (ch < 48 || ch > 48+7) /* not digits '0'--'7' */
624 : {
625 1 : rv = store_char_grow (reader, reader->charparam);
626 1 : if (rv != PDF_OK) return rv;
627 :
628 : /* ch isn't part of the escape sequence, so retry */
629 1 : reader->substate = 0;
630 : goto start;
631 : }
632 :
633 : /* ch is a digit from '0'--'7' */
634 3 : reader->charparam = ((reader->charparam & 0x1f) << 3) | (ch - 48);
635 3 : if (reader->substate == 4) /* this was the final digit */
636 : {
637 1 : rv = store_char_grow (reader, reader->charparam);
638 1 : if (rv != PDF_OK) return rv;
639 :
640 1 : reader->substate = 0;
641 1 : return PDF_OK;
642 : }
643 :
644 2 : reader->substate = 4;
645 2 : return PDF_OK;
646 :
647 : default:
648 0 : assert (0);
649 : }
650 : }
651 :
652 :
653 : static INLINE pdf_status_t
654 : handle_hexstring_char (pdf_token_reader_t reader,
655 : pdf_u32_t flags,
656 : pdf_char_t ch,
657 : pdf_token_t *token)
658 : {
659 : pdf_status_t rv;
660 :
661 46 : if (reader->substate == 0)
662 : {
663 : /* this is the first character after the initial '<' */
664 4 : if (ch == 60) /* '<' */
665 : {
666 : /* this was actually the start of a dictionary */
667 1 : reader->state = PDF_TOKR_STATE_PENDING;
668 1 : reader->charparam = ch;
669 1 : return exit_state (reader, flags, token);
670 : }
671 :
672 3 : reader->substate = 1;
673 : }
674 :
675 90 : if (pdf_is_wspace_char (ch))
676 12 : return PDF_OK;
677 :
678 33 : if (ch == 62) /* '>': end of hex string */
679 : {
680 3 : if (reader->substate == 2)
681 : {
682 : /* the last digit is missing; assume it's '0' */
683 2 : rv = store_char_grow (reader, reader->charparam << 4);
684 2 : if (rv != PDF_OK) return rv;
685 : }
686 :
687 3 : reader->substate = 3; /* saw end of string */
688 3 : return exit_state (reader, flags, token);
689 : }
690 :
691 60 : if ( (ch = hexval (ch)) == 255 )
692 0 : return PDF_EBADFILE;
693 :
694 30 : if (reader->substate == 1) /* first character in a pair */
695 : {
696 16 : reader->substate = 2;
697 16 : reader->charparam = ch;
698 16 : return PDF_OK;
699 : }
700 :
701 14 : rv = store_char_grow (reader, (reader->charparam << 4) | ch);
702 14 : if (rv == PDF_OK)
703 14 : reader->substate = 1;
704 14 : return rv;
705 : }
706 :
707 : pdf_status_t
708 : pdf_token_read (pdf_token_reader_t reader, pdf_u32_t flags, pdf_token_t *token)
709 1960 : {
710 : pdf_status_t rv;
711 : pdf_char_t ch;
712 1960 : pdf_token_t new_token = NULL;
713 :
714 1960 : if (!reader || !reader->stream || !token)
715 0 : return PDF_EBADDATA;
716 :
717 49612 : while ( (rv = pdf_stm_peek_char (reader->stream, &ch)) == PDF_OK )
718 : {
719 49254 : rv = handle_char (reader, flags, ch, &new_token);
720 49254 : if (rv == PDF_OK)
721 : {
722 : /* The character we peeked at was accepted, so get rid of it. */
723 47661 : pdf_stm_read_char (reader->stream, &ch);
724 : }
725 :
726 49254 : if (new_token)
727 : {
728 : /* Don't return an error code if we got a valid token.
729 : * We'll probably see the same error on the next call since we
730 : * didn't call read_char. */
731 1445 : assert (rv == PDF_OK || rv == PDF_EAGAIN);
732 : goto ret_token;
733 : }
734 47809 : else if (rv != PDF_OK && rv != PDF_EAGAIN)
735 157 : return rv;
736 : }
737 :
738 : /* peek_char returned an error code (rv) */
739 358 : if (rv != PDF_EEOF)
740 0 : return rv;
741 :
742 358 : rv = exit_state (reader, flags, &new_token);
743 358 : if (rv != PDF_OK)
744 176 : return rv;
745 :
746 182 : reader->state = PDF_TOKR_STATE_EOF;
747 182 : if (new_token)
748 179 : goto ret_token;
749 : else
750 3 : return PDF_EEOF;
751 :
752 1624 : ret_token:
753 1624 : assert (new_token);
754 1624 : *token = new_token;
755 1624 : return PDF_OK;
756 : }
757 :
758 : pdf_size_t
759 : pdf_token_reader_begin_pos (pdf_token_reader_t reader)
760 1422 : {
761 1422 : return reader->beg_pos;
762 : }
763 :
764 : static INLINE int
765 : parse_integer (pdf_buffer_t buffer, int *int_value, int *int_state)
766 : {
767 : /* Parse an ASCII integer with the given radix, at the beginning of
768 : * the buffer (possibly leaving unread bytes at the end).
769 : *
770 : * Return value is 0 on failure, or a bitmask otherwise:
771 : * 1 = valid integer
772 : * 2 = signed
773 : * 4 = overflowed (no value stored in *int_value)
774 : */
775 :
776 825 : int sign = 0, tmpint = 0, overflowed = 0, ret;
777 : /* Integer states (int_state):
778 : * 0 = at start (looking for sign or digits)
779 : * 1 = saw sign
780 : * 2 = saw digits
781 : */
782 :
783 825 : *int_state = 0;
784 237 : for (; buffer->rp < buffer->wp; ++buffer->rp)
785 : {
786 : int chval;
787 888 : pdf_char_t ch = buffer->data[buffer->rp];
788 888 : if (ch == 43 || ch == 45) /* '+','-' */
789 : {
790 0 : if (*int_state != 0)
791 : goto out;
792 :
793 0 : *int_state = 1;
794 0 : sign = (ch == 43) ? 1 : -1;
795 : continue;
796 : }
797 :
798 888 : chval = ch - 48; /* assume this is a digit */
799 888 : if (chval < 0 || chval > 9)
800 : goto out; /* not a valid number */
801 :
802 237 : *int_state = 2;
803 237 : if (overflowed)
804 : continue;
805 :
806 : /* convert the digits to an integer, if possible */
807 223 : if (sign < 0)
808 : {
809 0 : chval = -chval;
810 0 : if ( tmpint < (INT_MIN/10)
811 : || (tmpint == (INT_MIN/10) && chval < (INT_MIN%10)) )
812 : {
813 0 : overflowed = 1; /* would overflow */
814 : continue;
815 : }
816 : }
817 : else
818 : {
819 223 : if ( tmpint > (INT_MAX/10)
820 : || (tmpint == (INT_MAX/10) && chval > (INT_MAX%10)) )
821 : {
822 1 : overflowed = 1; /* would overflow */
823 : continue;
824 : }
825 : }
826 :
827 222 : tmpint += chval + (tmpint * 9);
828 : }
829 :
830 825 : out:
831 825 : if (*int_state != 2)
832 639 : return 0; /* never saw any digits */
833 :
834 186 : ret = 1;
835 186 : if (sign) ret += 2;
836 186 : if (overflowed)
837 1 : ret += 4;
838 : else
839 185 : *int_value = tmpint;
840 :
841 186 : return ret;
842 : }
843 :
844 :
845 : static INLINE int
846 : validate_real (pdf_buffer_t buffer, int int_state)
847 : {
848 : /* Determines whether the given number is a valid PS/PDF real number;
849 : * assumes the initial sign was already read (if present), and any data
850 : * before buffer->rp is a valid integer.
851 : *
852 : * Return value:
853 : * 0 = not a real number
854 : * 1 = valid PDF/PS real
855 : */
856 :
857 651 : int seen_point = 0;
858 : /* Integer states (int_state):
859 : * 0 = at start
860 : * 1 = saw sign
861 : * 2 = saw digits
862 : */
863 :
864 29 : for (; buffer->rp < buffer->wp; ++buffer->rp)
865 : {
866 668 : pdf_char_t ch = buffer->data[buffer->rp];
867 668 : if (ch == 46) /* '.' */
868 : {
869 12 : if (!seen_point)
870 12 : seen_point = 1;
871 : else
872 0 : return 0;
873 : }
874 656 : else if (ch == 43 || ch == 45) /* '+','-' */
875 : {
876 0 : if (int_state == 0)
877 0 : int_state = 1;
878 : else
879 0 : return 0;
880 : }
881 656 : else if (ch >= 48+0 && ch <= 48+9)
882 17 : int_state = 2;
883 : else
884 639 : return 0;
885 : }
886 :
887 12 : return (int_state == 2); /* only valid if we saw a digit */
888 : }
889 :
890 :
891 : /* Given a buffer containing a validated PDF real (in ASCII), convert it to a
892 : * double by translating it to the execution character set, replacing '.' with
893 : * the locale's decimal point, and calling strtod. */
894 : static INLINE pdf_status_t
895 : parse_real (pdf_buffer_t buffer, char *locale_dec_pt, double *value)
896 : {
897 : pdf_status_t ret;
898 : size_t tmplen, wpos, ptlen;
899 : char *tmp, *endptr;
900 :
901 13 : ptlen = strlen (locale_dec_pt);
902 : /* we may remove 1 byte ('.') and replace it with ptlen bytes */
903 13 : tmplen = buffer->wp - 1 + ptlen;
904 :
905 13 : tmp = pdf_alloc (tmplen + 1);
906 13 : if (!tmp)
907 0 : return PDF_ENOMEM;
908 :
909 13 : wpos = 0;
910 13 : ret = PDF_ERROR; /* nothing should fail if the buffer was validated */
911 79 : for (buffer->rp = 0; buffer->rp < buffer->wp; ++buffer->rp)
912 : {
913 66 : pdf_char_t ch = buffer->data[buffer->rp];
914 66 : if (wpos >= tmplen)
915 : goto out;
916 :
917 66 : if (ch == 46) /* '.' */
918 : {
919 12 : if (wpos + ptlen > tmplen)
920 : goto out;
921 :
922 12 : memcpy (tmp + wpos, locale_dec_pt, ptlen);
923 12 : wpos += ptlen;
924 : }
925 54 : else if (ch == 43) /* '+' */
926 0 : tmp[wpos++] = '+';
927 54 : else if (ch == 45) /* '-' */
928 0 : tmp[wpos++] = '-';
929 54 : else if (ch >= 48+0 && ch <= 48+9) /* '0'--'9' */
930 54 : tmp[wpos++] = '0' + (ch-48);
931 : else
932 : goto out;
933 : }
934 :
935 : /* null-terminate the new string, and call strtod to get its value
936 : * (strtof would also work if it's available) */
937 13 : tmp[wpos] = '\0';
938 13 : *value = strtod (tmp, &endptr);
939 13 : if (endptr == tmp + wpos)
940 13 : ret = PDF_OK;
941 :
942 13 : out:
943 13 : pdf_dealloc (tmp);
944 13 : return ret;
945 : }
946 :
947 :
948 : /*
949 : * Return value:
950 : * 0 = not a number
951 : * 1 = integer (stored in *int_value)
952 : * 2 = real
953 : */
954 : static int
955 : recognise_number (pdf_buffer_t buffer, int *int_value)
956 : {
957 825 : int rv, tmpint = 0, int_state = 0;
958 :
959 : /* try to parse as an integer */
960 :
961 825 : buffer->rp = 0;
962 825 : rv = parse_integer (buffer, &tmpint, &int_state);
963 :
964 825 : if (buffer->rp < buffer->wp) /* didn't look at the whole buffer */
965 : {
966 1302 : rv = validate_real (buffer, int_state);
967 651 : if (rv == 1)
968 12 : return 2;
969 :
970 639 : return 0;
971 : }
972 :
973 174 : if (!rv) return 0;
974 174 : else if (rv & 4)
975 1 : return 2; /* integer overflowed, but could be read as a real */
976 :
977 173 : *int_value = tmpint;
978 173 : return 1;
979 : }
980 :
981 :
982 : /* End of pdf-token-reader.c */
|