Libparserutils
codec_8859.c
Go to the documentation of this file.
1/*
2 * This file is part of LibParserUtils.
3 * Licensed under the MIT License,
4 * http://www.opensource.org/licenses/mit-license.php
5 * Copyright 2008 John-Mark Bell <jmb@netsurf-browser.org>
6 */
7
8#include <assert.h>
9#include <stdlib.h>
10#include <string.h>
11
13
15#include "utils/endian.h"
16#include "utils/utils.h"
17
19
/* Table of the ISO-8859-n charsets handled by this codec.
 *
 * Each entry pairs a charset name (and its length) with the 96-entry mapping
 * table used for that charset. The mib field starts out as 0 in every entry
 * and is assigned later by charset_8859_codec_handles_charset().
 * There is no entry for ISO-8859-12. */
20static struct {
21 uint16_t mib; /* MIB enum value for this charset; 0 until populated */
22 const char *name; /* Charset name */
23 size_t len; /* Length of name, computed with SLEN() */
24 uint32_t *table; /* Mapping table for this charset (one of t1..t16) */
25} known_charsets[] = {
26 { 0, "ISO-8859-1", SLEN("ISO-8859-1"), t1 },
27 { 0, "ISO-8859-2", SLEN("ISO-8859-2"), t2 },
28 { 0, "ISO-8859-3", SLEN("ISO-8859-3"), t3 },
29 { 0, "ISO-8859-4", SLEN("ISO-8859-4"), t4 },
30 { 0, "ISO-8859-5", SLEN("ISO-8859-5"), t5 },
31 { 0, "ISO-8859-6", SLEN("ISO-8859-6"), t6 },
32 { 0, "ISO-8859-7", SLEN("ISO-8859-7"), t7 },
33 { 0, "ISO-8859-8", SLEN("ISO-8859-8"), t8 },
34 { 0, "ISO-8859-9", SLEN("ISO-8859-9"), t9 },
35 { 0, "ISO-8859-10", SLEN("ISO-8859-10"), t10 },
36 { 0, "ISO-8859-11", SLEN("ISO-8859-11"), t11 },
37 { 0, "ISO-8859-13", SLEN("ISO-8859-13"), t13 },
38 { 0, "ISO-8859-14", SLEN("ISO-8859-14"), t14 },
39 { 0, "ISO-8859-15", SLEN("ISO-8859-15"), t15 },
40 { 0, "ISO-8859-16", SLEN("ISO-8859-16"), t16 }
41};
42
/* ISO-8859-n charset codec.
 *
 * NOTE(review): several lines of this definition are absent from this
 * listing (the base member, the read_buf/write_buf arrays that the code below
 * uses, and the closing brace) -- confirm against the original file. */
46typedef struct charset_8859_codec {
48
49 uint32_t *table; /* Mapping table for 0xA0-0xFF (96 entries) */
50
/* Capacity, in characters, of the decode save buffer */
51#define READ_BUFSIZE (8)
55 size_t read_len; /* Number of characters currently held in read_buf */
56
/* Capacity, in characters, of the encode save buffer */
57#define WRITE_BUFSIZE (8)
61 size_t write_len; /* Number of characters currently held in write_buf */
62
64
/* Forward declarations of this file's static functions.
 *
 * NOTE(review): apart from the first declaration, only parameter-list
 * continuation lines survive in this listing; the lines naming each function
 * are absent -- confirm against the original file. */
65static bool charset_8859_codec_handles_charset(const char *charset);
66static parserutils_error charset_8859_codec_create(const char *charset,
72 const uint8_t **source, size_t *sourcelen,
73 uint8_t **dest, size_t *destlen);
76 const uint8_t **source, size_t *sourcelen,
77 uint8_t **dest, size_t *destlen);
82 const uint8_t **source, size_t *sourcelen,
83 uint8_t **dest, size_t *destlen);
86 uint32_t ucs4, uint8_t **dest, size_t *destlen);
88 uint32_t ucs4, uint8_t **s, size_t *len);
90 const uint8_t *s, size_t len, uint32_t *ucs4);
91
/**
 * Determine whether this codec handles a specific charset.
 *
 * \param charset  Charset name to look up; must be NUL-terminated, since its
 *                 length is taken with strlen()
 * \return true if the MIB enum resolved for the name equals the mib field of
 *         some known_charsets entry, false otherwise
 */
98bool charset_8859_codec_handles_charset(const char *charset)
99{
100 uint32_t i;
101 uint16_t match = parserutils_charset_mibenum_from_name(charset,
102 strlen(charset));
103
/* The mib fields are statically 0; a zero first entry means they have not
 * been filled in yet, so assign every entry's mib now. */
104 if (known_charsets[0].mib == 0) {
105 for (i = 0; i < N_ELEMENTS(known_charsets); i++) {
/* NOTE(review): the right-hand side of this assignment is missing from this
 * listing -- presumably a MIB lookup of known_charsets[i].name with length
 * known_charsets[i].len; confirm against the original file. */
106 known_charsets[i].mib =
110 }
111 }
112
/* Linear search for an entry whose MIB enum equals the requested one */
113 for (i = 0; i < N_ELEMENTS(known_charsets); i++) {
114 if (known_charsets[i].mib == match)
115 return true;
116 }
117
118 return false;
119}
120
/* Create an ISO-8859-n codec.
 *
 * Looks up the mapping table whose known_charsets entry has a mib equal to
 * `match`, allocates a charset_8859_codec, stores the table in it, clears the
 * read/write save-buffer state and hands the new object back through *codec.
 * Returns PARSERUTILS_OK on success or PARSERUTILS_NOMEM if malloc() fails.
 *
 * NOTE(review): the function signature, the declaration of `c`, the start of
 * the `match` initialiser and the vtable assignments are missing from this
 * listing -- confirm against the original file. */
132{
133 uint32_t i;
136 charset, strlen(charset));
137 uint32_t *table = NULL;
138
/* Find the mapping table for the requested charset. This compares against
 * the mib fields of known_charsets, which are statically 0 and, in the code
 * visible here, only assigned by charset_8859_codec_handles_charset().
 * NOTE(review): this appears to rely on that function having run first. */
139 for (i = 0; i < N_ELEMENTS(known_charsets); i++) {
140 if (known_charsets[i].mib == match) {
141 table = known_charsets[i].table;
142 break;
143 }
144 }
145
/* An unknown charset is treated as a programming error. When assertions are
 * compiled out (NDEBUG) execution continues with table == NULL. */
146 assert(table != NULL);
147
148 c = malloc(sizeof(charset_8859_codec));
149 if (c == NULL)
150 return PARSERUTILS_NOMEM;
151
152 c->table = table;
153
/* Start with empty save buffers; only element 0 of each is cleared, the
 * lengths are what mark them as empty. */
154 c->read_buf[0] = 0;
155 c->read_len = 0;
156
157 c->write_buf[0] = 0;
158 c->write_len = 0;
159
160 /* Finally, populate vtable */
165
166 *codec = (parserutils_charset_codec *) c;
167
168 return PARSERUTILS_OK;
169}
170
183
/* Encode a chunk of UCS-4 (big endian) data into ISO-8859-n.
 *
 * First retries any characters saved in write_buf by a previous call, then
 * converts the input four bytes at a time. *source / *sourcelen are advanced
 * past every input character that was either written or saved; *dest /
 * *destlen are advanced by charset_8859_from_ucs4() as bytes are written.
 * Returns PARSERUTILS_OK when all input was consumed, PARSERUTILS_NOMEM when
 * the output buffer filled up, or another error from
 * charset_8859_from_ucs4().
 *
 * NOTE(review): the first signature line and the declaration of `c` are
 * missing from this listing -- confirm against the original file. */
212 const uint8_t **source, size_t *sourcelen,
213 uint8_t **dest, size_t *destlen)
214{
216 uint32_t ucs4;
217 uint32_t *towrite;
218 size_t towritelen;
219 parserutils_error error;
220
221 /* Process any outstanding characters from the previous call */
222 if (c->write_len > 0) {
223 uint32_t *pwrite = c->write_buf;
224
225 while (c->write_len > 0) {
226 error = charset_8859_from_ucs4(c, pwrite[0],
227 dest, destlen);
228 if (error != PARSERUTILS_OK) {
229 uint32_t len;
/* NOTE(review): charset_8859_from_ucs4() checks for output space before it
 * checks representability, so a saved character can still yield
 * PARSERUTILS_INVALID here in strict error mode and trip this assert --
 * confirm whether that is intended. */
230 assert(error == PARSERUTILS_NOMEM);
231
/* Move the still-unwritten characters to the front of write_buf so the
 * next call starts again from index 0. */
232 for (len = 0; len < c->write_len; len++) {
233 c->write_buf[len] = pwrite[len];
234 }
235
236 return error;
237 }
238
239 pwrite++;
240 c->write_len--;
241 }
242 }
243
244 /* Now process the characters for this call */
/* NOTE(review): each iteration reads and consumes 4 bytes; if *sourcelen is
 * not a multiple of 4 the read runs past the input and the unsigned
 * subtraction below wraps around. The caller is presumably required to pass
 * whole UCS-4 characters -- confirm. */
245 while (*sourcelen > 0) {
/* Reads the next big-endian UCS-4 unit through a cast pointer; this assumes
 * *source is suitably aligned for uint32_t access. */
246 ucs4 = endian_big_to_host(*((uint32_t *) (void *) *source));
247 towrite = &ucs4;
248 towritelen = 1;
249
250 /* Output current characters */
251 while (towritelen > 0) {
252 error = charset_8859_from_ucs4(c, towrite[0], dest,
253 destlen);
254 if (error != PARSERUTILS_OK) {
255 uint32_t len;
/* Any error other than "output full" is returned without consuming the
 * offending input character. */
256 if (error != PARSERUTILS_NOMEM) {
257 return error;
258 }
259
260 /* Insufficient output space */
261 assert(towritelen < WRITE_BUFSIZE);
262
263 c->write_len = towritelen;
264
265 /* Copy pending chars to save area, for
266 * processing next call. */
267 for (len = 0; len < towritelen; len++)
268 c->write_buf[len] = towrite[len];
269
270 /* Claim character we've just buffered,
271 * so it's not reprocessed */
272 *source += 4;
273 *sourcelen -= 4;
274
275 return PARSERUTILS_NOMEM;
276 }
277
278 towrite++;
279 towritelen--;
280 }
281
/* Character fully written; consume it from the input */
282 *source += 4;
283 *sourcelen -= 4;
284 }
285
286 return PARSERUTILS_OK;
287}
288
/* Decode a chunk of ISO-8859-n data into UCS-4 (big endian).
 *
 * First writes out any characters left in read_buf by a previous call, then
 * converts input until *sourcelen reaches 0 or a conversion step reports an
 * error. Returns PARSERUTILS_OK when all input was consumed,
 * PARSERUTILS_NOMEM when the output buffer cannot hold the saved characters,
 * or whatever error the per-character step returned.
 *
 * NOTE(review): the first signature line and the declaration of `c` are
 * missing from this listing -- confirm against the original file. */
331 const uint8_t **source, size_t *sourcelen,
332 uint8_t **dest, size_t *destlen)
333{
335 parserutils_error error;
336
337 if (c->read_len > 0) {
338 /* Output left over from last decode */
339 uint32_t *pread = c->read_buf;
340
/* The space test covers all remaining saved characters (4 bytes each). As
 * both sides shrink in step (4 bytes / one character per iteration), the loop
 * either writes every saved character or none of them. */
341 while (c->read_len > 0 && *destlen >= c->read_len * 4) {
/* Stores through a cast pointer; this assumes *dest is suitably aligned for
 * uint32_t access. */
342 *((uint32_t *) (void *) *dest) =
343 endian_host_to_big(pread[0]);
344
345 *dest += 4;
346 *destlen -= 4;
347
348 pread++;
349 c->read_len--;
350 }
351
352 if (*destlen < c->read_len * 4) {
353 /* Ran out of output buffer */
354 size_t i;
355
356 /* Shuffle remaining output down */
357 for (i = 0; i < c->read_len; i++)
358 c->read_buf[i] = pread[i];
359
360 return PARSERUTILS_NOMEM;
361 }
362 }
363
364 /* Finally, the "normal" case; process all outstanding characters */
365 while (*sourcelen > 0) {
/* NOTE(review): the line holding the start of this call is missing from this
 * listing -- presumably `error = charset_8859_codec_read_char(c,`; confirm
 * against the original file. */
367 source, sourcelen, dest, destlen);
368 if (error != PARSERUTILS_OK) {
369 return error;
370 }
371 }
372
373 return PARSERUTILS_OK;
374}
375
/* Clear an ISO-8859-n codec's encoding state.
 *
 * Empties both save buffers by zeroing their lengths (and their first
 * element), discarding any characters pending from earlier encode/decode
 * calls. Always returns PARSERUTILS_OK.
 *
 * NOTE(review): the function signature and the declaration of `c` are missing
 * from this listing -- confirm against the original file. */
383{
385
386 c->read_buf[0] = 0;
387 c->read_len = 0;
388
389 c->write_buf[0] = 0;
390 c->write_len = 0;
391
392 return PARSERUTILS_OK;
393}
394
395
/* Read a character from the ISO-8859-n input and output it as UCS-4
 * (big endian).
 *
 * Converts the byte at *source with charset_8859_to_ucs4() and emits the
 * result; an invalid byte is either reported (strict error mode) or replaced
 * by U+FFFD. *source / *sourcelen advance by one byte whenever a character
 * was emitted or saved for later.
 *
 * NOTE(review): the first signature line, the heads of both output calls and
 * the right-hand operand of the error-mode comparison are missing from this
 * listing -- confirm against the original file. */
425 const uint8_t **source, size_t *sourcelen,
426 uint8_t **dest, size_t *destlen)
427{
428 uint32_t ucs4;
429 parserutils_error error;
430
431 /* Convert a single character */
432 error = charset_8859_to_ucs4(c, *source, *sourcelen, &ucs4);
433 if (error == PARSERUTILS_OK) {
434 /* Read a character */
/* NOTE(review): call head missing from this listing -- presumably
 * `error = charset_8859_codec_output_decoded_char(c,`. */
436 ucs4, dest, destlen);
/* The input byte is consumed on PARSERUTILS_NOMEM as well: in that case
 * charset_8859_codec_output_decoded_char() has stored the character in
 * read_buf, so it must not be converted again. */
437 if (error == PARSERUTILS_OK || error == PARSERUTILS_NOMEM) {
438 /* output succeeded; update source pointers */
439 *source += 1;
440 *sourcelen -= 1;
441 }
442
443 return error;
444 } else if (error == PARSERUTILS_NEEDDATA) {
445 /* Can only happen if sourcelen == 0 */
446 return error;
447 } else if (error == PARSERUTILS_INVALID) {
448 /* Illegal input sequence */
449
450 /* Strict errormode; simply flag invalid character */
/* NOTE(review): the right-hand operand of this comparison is missing from
 * this listing -- presumably PARSERUTILS_CHARSET_CODEC_ERROR_STRICT followed
 * by `) {`. The invalid byte is not consumed on this path. */
451 if (c->base.errormode ==
453 return PARSERUTILS_INVALID;
454 }
455
456 /* output U+FFFD and continue processing. */
/* NOTE(review): call head missing from this listing -- presumably
 * `error = charset_8859_codec_output_decoded_char(c,`. */
458 0xFFFD, dest, destlen);
459 if (error == PARSERUTILS_OK || error == PARSERUTILS_NOMEM) {
460 /* output succeeded; update source pointers */
461 *source += 1;
462 *sourcelen -= 1;
463 }
464
465 return error;
466 }
467
/* Reached only for an error code not handled by the branches above */
468 return PARSERUTILS_OK;
469}
470
482 uint32_t ucs4, uint8_t **dest, size_t *destlen)
483{
484 if (*destlen < 4) {
485 /* Run out of output buffer */
486 c->read_len = 1;
487 c->read_buf[0] = ucs4;
488
489 return PARSERUTILS_NOMEM;
490 }
491
492 *((uint32_t *) (void *) *dest) = endian_host_to_big(ucs4);
493 *dest += 4;
494 *destlen -= 4;
495
496 return PARSERUTILS_OK;
497}
498
516 uint32_t ucs4, uint8_t **s, size_t *len)
517{
518 uint8_t out = 0;
519
520 if (*len < 1)
521 return PARSERUTILS_NOMEM;
522
523 if (ucs4 < 0x80) {
524 /* ASCII */
525 out = ucs4;
526 } else {
527 uint32_t i;
528
529 for (i = 0; i < 96; i++) {
530 if (ucs4 == c->table[i])
531 break;
532 }
533
534 if (i == 96) {
535 if (c->base.errormode ==
537 return PARSERUTILS_INVALID;
538 else
539 out = '?';
540 } else {
541 out = 0xA0 + i;
542 }
543 }
544
545 *(*s) = out;
546 (*s)++;
547 (*len)--;
548
549 return PARSERUTILS_OK;
550}
551
564 const uint8_t *s, size_t len, uint32_t *ucs4)
565{
566 uint32_t out;
567
568 if (len < 1)
570
571 if (*s < 0x80) {
572 out = *s;
573 } else if (*s >= 0xA0) {
574 if (c->table[*s - 0xA0] == 0xFFFF)
575 return PARSERUTILS_INVALID;
576
577 out = c->table[*s - 0xA0];
578 } else {
579 return PARSERUTILS_INVALID;
580 }
581
582 *ucs4 = out;
583
584 return PARSERUTILS_OK;
585}
586
591
static uint32_t t9[96]
static uint32_t t10[96]
static uint32_t t11[96]
static uint32_t t13[96]
static uint32_t t1[96]
Definition 8859_tables.h:16
static uint32_t t6[96]
Definition 8859_tables.h:91
static uint32_t t15[96]
static uint32_t t3[96]
Definition 8859_tables.h:46
static uint32_t t16[96]
static uint32_t t7[96]
static uint32_t t4[96]
Definition 8859_tables.h:61
static uint32_t t2[96]
Definition 8859_tables.h:31
static uint32_t t5[96]
Definition 8859_tables.h:76
static uint32_t t8[96]
static uint32_t t14[96]
parserutils_charset_handler charset_8859_codec_handler
Definition codec_8859.c:587
@ PARSERUTILS_CHARSET_CODEC_ERROR_STRICT
Abort processing if unrepresentable character encountered.
Definition codec.h:64
uint32_t * table
Definition codec_8859.c:24
static parserutils_error charset_8859_codec_encode(parserutils_charset_codec *codec, const uint8_t **source, size_t *sourcelen, uint8_t **dest, size_t *destlen)
Encode a chunk of UCS-4 (big endian) data into ISO-8859-n.
Definition codec_8859.c:211
static parserutils_error charset_8859_codec_create(const char *charset, parserutils_charset_codec **codec)
Create an ISO-8859-n codec.
Definition codec_8859.c:130
static parserutils_error charset_8859_to_ucs4(charset_8859_codec *c, const uint8_t *s, size_t len, uint32_t *ucs4)
Convert an ISO-8859-n character to UCS4 (host endian)
Definition codec_8859.c:563
static parserutils_error charset_8859_codec_decode(parserutils_charset_codec *codec, const uint8_t **source, size_t *sourcelen, uint8_t **dest, size_t *destlen)
Decode a chunk of ISO-8859-n data into UCS-4 (big endian)
Definition codec_8859.c:330
#define READ_BUFSIZE
Definition codec_8859.c:51
size_t len
Definition codec_8859.c:23
uint16_t mib
Definition codec_8859.c:21
const char * name
Definition codec_8859.c:22
static struct @253173311160310370314263334127070070016153225302 known_charsets[]
static parserutils_error charset_8859_codec_reset(parserutils_charset_codec *codec)
Clear an ISO-8859-n codec's encoding state.
Definition codec_8859.c:382
static parserutils_error charset_8859_codec_output_decoded_char(charset_8859_codec *c, uint32_t ucs4, uint8_t **dest, size_t *destlen)
Output a UCS-4 character (big endian)
Definition codec_8859.c:481
static parserutils_error charset_8859_codec_destroy(parserutils_charset_codec *codec)
Destroy an ISO-8859-n codec.
Definition codec_8859.c:177
#define WRITE_BUFSIZE
Definition codec_8859.c:57
static parserutils_error charset_8859_from_ucs4(charset_8859_codec *c, uint32_t ucs4, uint8_t **s, size_t *len)
Convert a UCS4 (host endian) character to ISO-8859-n.
Definition codec_8859.c:515
static parserutils_error charset_8859_codec_read_char(charset_8859_codec *c, const uint8_t **source, size_t *sourcelen, uint8_t **dest, size_t *destlen)
Read a character from the ISO-8859-n to UCS-4 (big endian)
Definition codec_8859.c:424
static bool charset_8859_codec_handles_charset(const char *charset)
Determine whether this codec handles a specific charset.
Definition codec_8859.c:98
static uint32_t endian_host_to_big(uint32_t host)
Definition endian.h:24
static uint32_t endian_big_to_host(uint32_t big)
Definition endian.h:32
parserutils_error
Definition errors.h:18
@ PARSERUTILS_OK
Definition errors.h:19
@ PARSERUTILS_NEEDDATA
Definition errors.h:25
@ PARSERUTILS_INVALID
Definition errors.h:23
@ PARSERUTILS_NOMEM
Definition errors.h:21
uint16_t parserutils_charset_mibenum_from_name(const char *alias, size_t len)
Retrieve the MIB enum value assigned to an encoding name.
Definition aliases.c:107
ISO-8859-n charset codec.
Definition codec_8859.c:46
uint32_t * table
Mapping table for 0xA0-0xFF.
Definition codec_8859.c:49
uint32_t write_buf[WRITE_BUFSIZE]
Buffer for partial output sequences (encode) (host-endian)
Definition codec_8859.c:58
size_t write_len
Character length of write_buf.
Definition codec_8859.c:61
parserutils_charset_codec base
Base class.
Definition codec_8859.c:47
size_t read_len
Character length of read_buf.
Definition codec_8859.c:55
uint32_t read_buf[READ_BUFSIZE]
Buffer for partial output sequences (decode) (host-endian)
Definition codec_8859.c:52
Core charset codec definition; implementations extend this.
Definition codec_impl.h:19
parserutils_charset_codec_errormode errormode
error mode
Definition codec_impl.h:22
parserutils_error(* encode)(parserutils_charset_codec *codec, const uint8_t **source, size_t *sourcelen, uint8_t **dest, size_t *destlen)
Definition codec_impl.h:26
parserutils_error(* destroy)(parserutils_charset_codec *codec)
Definition codec_impl.h:25
parserutils_error(* decode)(parserutils_charset_codec *codec, const uint8_t **source, size_t *sourcelen, uint8_t **dest, size_t *destlen)
Definition codec_impl.h:29
struct parserutils_charset_codec::@271367034342366162232062053053007137175253257255 handler
Vtable for handler code.
parserutils_error(* reset)(parserutils_charset_codec *codec)
Definition codec_impl.h:32
Codec factory component definition.
Definition codec_impl.h:39
#define UNUSED(x)
Definition utils.h:25
#define SLEN(s)
Definition utils.h:21
#define N_ELEMENTS(s)
Definition utils.h:29