maug
Quick and dirty C mini-augmentation library.
Loading...
Searching...
No Matches
mhtml.h
1
2#ifndef MHTML_H
3#define MHTML_H
4
/* Initial size for the parser's tag storage.
 * NOTE(review): not referenced in the visible part of this header; confirm
 * where (or whether) it is still consumed.
 */
#ifndef MHTML_PARSER_TAGS_INIT_SZ
# define MHTML_PARSER_TAGS_INIT_SZ 10
#endif /* !MHTML_PARSER_TAGS_INIT_SZ */

/* Defining MHTML_C (to emit this header's implementation) also defines
 * MCSS_C before mcss.h is included below; presumably so that mcss.h emits its
 * implementation in the same translation unit.
 */
#ifdef MHTML_C
# define MCSS_C
#endif /* MHTML_C */

/* Capacity (not counting the terminating NUL) of the line buffer that
 * mhtml_dump_tree() assembles for each tag.
 */
#ifndef MHTML_DUMP_LINE_SZ
# define MHTML_DUMP_LINE_SZ 255
#endif /* !MHTML_DUMP_LINE_SZ */

/* Maximum number of characters kept for an IMG tag's src; the field itself is
 * declared one byte larger.
 */
#ifndef MHTML_SRC_HREF_SZ_MAX
# define MHTML_SRC_HREF_SZ_MAX 128
#endif /* !MHTML_SRC_HREF_SZ_MAX */

/* Level handed to debug_printf() for this module's trace messages. */
#ifndef MHTML_TRACE_LVL
# define MHTML_TRACE_LVL 0
#endif /* !MHTML_TRACE_LVL */

/* Tag flag: raised in the parser's tag_flags when a STYLE element is opened,
 * so that the next text tag pushed is typed STYLE and its contents are fed to
 * the CSS parser.
 */
#define MHTML_TAG_FLAG_STYLE 0x02

/* Value of an INPUT tag's input_type field for type="button". */
#define MHTML_INPUT_TYPE_BUTTON 0x01
29
30#include <mparser.h>
31#include <mcss.h>
32
/* X-macro table of recognised attributes. Each row is f( name, index ); the
 * table is expanded further down into the attribute name list and the
 * MHTML_ATTRIB_KEY_* constants, so the index must match the row's position.
 */
#define MHTML_ATTRIB_TABLE( f ) \
   f( NONE, 0 ) \
   f( STYLE, 1 ) \
   f( CLASS, 2 ) \
   f( ID, 3 ) \
   f( NAME, 4 ) \
   f( SRC, 5 ) \
   f( TYPE, 6 ) \
   f( VALUE, 7 )

/* X-macro table of recognised tags. Each row is
 * f( id, name, extra struct fields, display ). It is expanded into one
 * struct MHTML_TAG_<name> per row, the members of union MHTML_TAG, the
 * MHTML_TAG_TYPE_* constants and the tag name list. The id doubles as the
 * index into the name list, so it must match the row's position.
 * NOTE(review): the display column (NONE/BLOCK/INLINE) is not consumed by any
 * expansion visible in this header; presumably used by a renderer elsewhere.
 */
#define MHTML_TAG_TABLE( f ) \
   f(  0, NONE, void* none;, NONE ) \
   f(  1, BODY, void* none;, BLOCK ) \
   f(  2, DIV, void* none;, BLOCK ) \
   f(  3, HEAD, void* none;, NONE ) \
   f(  4, HTML, void* none;, BLOCK ) \
   f(  5, TEXT, ssize_t content_idx; size_t content_sz;, INLINE ) \
   f(  6, TITLE, ssize_t content_idx; size_t content_sz;, NONE ) \
   f(  7, SPAN, void* none;, INLINE ) \
   f(  8, BR, void* none;, BLOCK ) \
   f(  9, STYLE, void* none;, NONE ) \
   f( 10, IMG, char src[MHTML_SRC_HREF_SZ_MAX + 1]; size_t src_sz;, BLOCK ) \
   f( 11, INPUT, uint8_t input_type; char name[MCSS_ID_SZ_MAX + 1]; size_t name_sz; char value[MCSS_ID_SZ_MAX + 1]; size_t value_sz;, INLINE )

/* X-macro table of parser states. Each row is f( constant name, value ); it
 * is expanded into uint8_t constants and handed to MPARSER_PSTATE_NAMES.
 */
#define MHTML_PARSER_PSTATE_TABLE( f ) \
   f( MHTML_PSTATE_NONE, 0 ) \
   f( MHTML_PSTATE_ELEMENT, 1 ) \
   f( MHTML_PSTATE_ATTRIB_KEY, 2 ) \
   f( MHTML_PSTATE_ATTRIB_VAL, 3 ) \
   f( MHTML_PSTATE_END_ELEMENT, 4 ) \
   f( MHTML_PSTATE_STRING, 5 ) \
   f( MHTML_PSTATE_STYLE, 6 )
65
66/* TODO: Function names should be verb_noun! */
67
#if 0
/* Disabled direct-index tag accessors. Each returns a pointer to the tag, or
 * NULL when the requested link index is negative.
 * NOTE(review): these index (parser)->tags as a plain array, which does not
 * match the struct MDATA_VECTOR tags field of struct MHTML_PARSER; they need
 * porting to mdata_vector_get() before being re-enabled.
 */
#define mhtml_tag( parser, idx ) (&((parser)->tags[idx]))

#define mhtml_tag_parent( parser, idx ) \
   (0 <= (parser)->tags[idx].parent ? \
      (&((parser)->tags[(parser)->tags[idx].parent])) : NULL)

#define mhtml_tag_child( parser, idx ) \
   (0 <= (parser)->tags[idx].first_child ? \
      (&((parser)->tags[(parser)->tags[idx].first_child])) : NULL)

#define mhtml_tag_sibling( parser, idx ) \
   (0 <= (parser)->tags[idx].next_sibling ? \
      (&((parser)->tags[(parser)->tags[idx].next_sibling])) : NULL)
#endif
83
/* Thin wrappers that forward to the generic mparser_* helpers, always passing
 * the embedded struct MPARSER ("base") of an MHTML_PARSER and the "mhtml"
 * label.
 */

/* Current parser state, as reported by mparser_pstate(). */
#define mhtml_parser_pstate( parser ) \
   mparser_pstate( &((parser)->base) )

/* Push/pop a parser state. When MPARSER_TRACE_NAMES is defined the state name
 * table gc_mhtml_pstate_names is passed along as an extra argument.
 */
#ifdef MPARSER_TRACE_NAMES
#  define mhtml_parser_pstate_push( parser, new_pstate ) \
      mparser_pstate_push( \
         "mhtml", &((parser)->base), new_pstate, gc_mhtml_pstate_names )

#  define mhtml_parser_pstate_pop( parser ) \
      mparser_pstate_pop( \
         "mhtml", &((parser)->base), gc_mhtml_pstate_names )
#else
#  define mhtml_parser_pstate_push( parser, new_pstate ) \
      mparser_pstate_push( "mhtml", &((parser)->base), new_pstate )

#  define mhtml_parser_pstate_pop( parser ) \
      mparser_pstate_pop( "mhtml", &((parser)->base) )
#endif /* MPARSER_TRACE_NAMES */

/* Report character c as invalid in the current state; retval is handed to
 * mparser_invalid_c(), presumably so it can be set to an error code.
 * NOTE(review): the label is passed as the bare token mhtml here, not the
 * string "mhtml" used by the other wrappers; confirm this matches what
 * mparser_invalid_c() expects.
 */
#define mhtml_parser_invalid_c( parser, c, retval ) \
   mparser_invalid_c( mhtml, &((parser)->base), c, retval )

/* Discard the token accumulated so far. */
#define mhtml_parser_reset_token( parser ) \
   mparser_reset_token( "mhtml", &((parser)->base) )

/* Append character c to the token being accumulated. */
#define mhtml_parser_append_token( parser, c ) \
   mparser_append_token( "mhtml", &((parser)->base), c )
111
/* Trace the change and then make iter the parser's current tag index.
 *
 * Wrapped in do { } while( 0 ) so the trace and the assignment behave as one
 * statement (e.g. under an unbraced if/else). Callers terminate the
 * invocation with a semicolon, as every use in this file already does.
 * iter is evaluated twice (once for the trace, once for the assignment).
 */
#define mhtml_parser_set_tag_iter( parser, iter ) \
   do { \
      debug_printf( MHTML_TRACE_LVL, "setting tag_iter to: " SSIZE_T_FMT \
         " (previously: " SSIZE_T_FMT ")", \
         (ssize_t)(iter), (parser)->tag_iter ); \
      (parser)->tag_iter = (iter); \
   } while( 0 )
116
/* Non-zero while the parser's tag vector is locked. The tags member is a
 * struct MDATA_VECTOR (not a pointer), so the vector's own lock query is
 * used here, matching the checks done in this file's cleanup sections.
 */
#define mhtml_parser_is_locked( parser ) \
   (mdata_vector_is_locked( &((parser)->tags) ))
118
120 uint16_t type;
121 uint8_t flags;
122 ssize_t parent;
123 ssize_t first_child;
124 ssize_t next_sibling;
125 ssize_t style;
126 /* TODO: Use str_stable for classes. */
127 char classes[MCSS_CLASS_SZ_MAX + 1];
128 size_t classes_sz;
129 /* TODO: Use str_stable for id. */
130 char id[MCSS_ID_SZ_MAX + 1];
131 size_t id_sz;
132};
133
/* Expands one MHTML_TAG_TABLE row into struct MHTML_TAG_<name>: the common
 * struct MHTML_TAG_BASE first, followed by that row's extra fields.
 */
#define MHTML_TAG_TABLE_STRUCT( tag_id, tag_name, fields, disp ) \
   struct MHTML_TAG_ ## tag_name { \
      struct MHTML_TAG_BASE base; \
      fields \
   };

/* Emit the per-tag structs for every row of the tag table. */
MHTML_TAG_TABLE( MHTML_TAG_TABLE_STRUCT )

/* Expands one MHTML_TAG_TABLE row into a member named after the tag, of the
 * matching struct MHTML_TAG_<name> type; used to populate union MHTML_TAG.
 */
#define MHTML_TAG_TABLE_UNION_FIELD( tag_id, tag_name, fields, disp ) \
   struct MHTML_TAG_ ## tag_name tag_name;
144
146 struct MHTML_TAG_BASE base; /* Should line up w/ 1st "base" in all types. */
147 MHTML_TAG_TABLE( MHTML_TAG_TABLE_UNION_FIELD )
148};
149
151 struct MPARSER base;
152 uint16_t attrib_key;
153 ssize_t tag_iter;
158 uint8_t tag_flags;
159 struct MCSS_PARSER styler;
160 struct MDATA_STRPOOL strpool;
161 struct MDATA_VECTOR tags;
162 ssize_t body_idx;
163};
164
/* Release the parser's string pool, tag vector and embedded CSS parser. */
MERROR_RETVAL mhtml_parser_free( struct MHTML_PARSER* parser );

/* Move the parser's current tag (tag_iter) up to the current tag's parent;
 * the current tag must be valid (tag_iter >= 0).
 */
MERROR_RETVAL mhtml_pop_tag( struct MHTML_PARSER* parser );

/* Feed a single character of HTML to the parser. */
MERROR_RETVAL mhtml_parse_c( struct MHTML_PARSER* parser, char c );

/* Prepare a parser for use: no current tag, no body tag seen, and the
 * embedded CSS parser initialised.
 */
MERROR_RETVAL mhtml_parser_init( struct MHTML_PARSER* parser );

/* Trace-print the tag at index iter, then its children (indented one level
 * deeper than d) and its following siblings (at level d). A negative iter is
 * a no-op.
 */
MERROR_RETVAL mhtml_dump_tree(
   struct MHTML_PARSER* parser, ssize_t iter, size_t d );
175
176#ifdef MHTML_C
177
/* Expands one parser-state row into a uint8_t constant holding its value. */
#define MHTML_PSTATE_TABLE_CONST( name, idx ) \
   MAUG_CONST uint8_t SEG_MCONST name = idx;

MHTML_PARSER_PSTATE_TABLE( MHTML_PSTATE_TABLE_CONST )

/* Presumably emits gc_mhtml_pstate_names, the state name table referenced by
 * the pstate push/pop wrappers when MPARSER_TRACE_NAMES is defined.
 */
MPARSER_PSTATE_NAMES( MHTML_PARSER_PSTATE_TABLE, mhtml )

/* Expands one tag row into MHTML_TAG_TYPE_<name>, holding the tag's id. */
#define MHTML_TAG_TABLE_CONST( tag_id, tag_name, fields, disp ) \
   MAUG_CONST uint16_t SEG_MCONST MHTML_TAG_TYPE_ ## tag_name = tag_id;

MHTML_TAG_TABLE( MHTML_TAG_TABLE_CONST )

/* Expands one tag row into its stringified (upper-case) name. */
#define MHTML_TAG_TABLE_NAMES( tag_id, tag_name, fields, disp ) \
   #tag_name,

/* Tag names indexed by tag id; the empty string marks the end of the list
 * for the lookup loop in mhtml_push_element_tag().
 */
MAUG_CONST char* SEG_MCONST gc_mhtml_tag_names[] = {
   MHTML_TAG_TABLE( MHTML_TAG_TABLE_NAMES )
   ""
};

/* Expands one attribute row into its stringified (upper-case) name. */
#define MHTML_ATTRIB_TABLE_NAME( name, idx ) \
   #name,

/* Attribute names indexed by attribute key; the empty string marks the end
 * of the list for the lookup loop in mhtml_push_attrib_key(). Unlike the tag
 * names, this list is file-local (static).
 */
static MAUG_CONST char* SEG_MCONST gc_mhtml_attrib_names[] = {
   MHTML_ATTRIB_TABLE( MHTML_ATTRIB_TABLE_NAME )
   ""
};

/* Expands one attribute row into MHTML_ATTRIB_KEY_<name>, holding its index. */
#define MHTML_ATTRIB_TABLE_NAME_CONST( attrib_name, attrib_id ) \
   MAUG_CONST uint16_t SEG_MCONST MHTML_ATTRIB_KEY_ ## attrib_name = attrib_id;

MHTML_ATTRIB_TABLE( MHTML_ATTRIB_TABLE_NAME_CONST )
210
211MERROR_RETVAL mhtml_parser_free( struct MHTML_PARSER* parser ) {
212 MERROR_RETVAL retval = MERROR_OK;
213 union MHTML_TAG* tag_iter = NULL;
214
215 debug_printf( MHTML_TRACE_LVL, "freeing HTML parser..." );
216
217 mdata_strpool_free( &(parser->strpool) );
218
219 mdata_vector_lock( &(parser->tags) );
220
221 while( 0 < mdata_vector_ct( &(parser->tags) ) ) {
222 tag_iter = mdata_vector_get( &(parser->tags), 0, union MHTML_TAG );
223 assert( NULL != tag_iter );
224
225 mdata_vector_unlock( &(parser->tags) );
226 mdata_vector_remove( &(parser->tags), 0 );
227 mdata_vector_lock( &(parser->tags) );
228 }
229
230cleanup:
231
232 mcss_parser_free( &(parser->styler) );
233
234 if( mdata_vector_is_locked( &(parser->tags) ) ) {
235 mdata_vector_unlock( &(parser->tags) );
236 }
237
238 mdata_vector_free( &(parser->tags) );
239
240 return retval;
241}
242
243MERROR_RETVAL mhtml_pop_tag( struct MHTML_PARSER* parser ) {
244 MERROR_RETVAL retval = MERROR_OK;
245 union MHTML_TAG* tag_iter = NULL;
246
247 /* Move up from current tag. */
248 assert( parser->tag_iter >= 0 );
249 mdata_vector_lock( &(parser->tags) );
250 tag_iter = mdata_vector_get(
251 &(parser->tags), parser->tag_iter, union MHTML_TAG );
252 assert( NULL != tag_iter );
253
254 mhtml_parser_set_tag_iter( parser, tag_iter->base.parent );
255
256 if( 0 <= parser->tag_iter ) {
257 debug_printf( MHTML_TRACE_LVL,
258 "moved iter back to tag %s (" SIZE_T_FMT ")",
259 gc_mhtml_tag_names[tag_iter->base.type], parser->tag_iter );
260 } else {
261 debug_printf( MHTML_TRACE_LVL, "moved iter back to root (-1)" );
262 }
263
264cleanup:
265
266 mdata_vector_unlock( &(parser->tags) );
267
268 return retval;
269}
270
271MERROR_RETVAL mhtml_push_tag( struct MHTML_PARSER* parser ) {
272 MERROR_RETVAL retval = MERROR_OK;
273 ssize_t new_tag_idx = -1;
274 ssize_t next_sibling_idx = -1;
275 union MHTML_TAG tag_new;
276 union MHTML_TAG* p_tag_new = NULL;
277 union MHTML_TAG* p_tag_iter = NULL;
278
279 maug_mzero( &tag_new, sizeof( union MHTML_TAG ) );
280 tag_new.base.parent = -1;
281 tag_new.base.first_child = -1;
282 tag_new.base.next_sibling = -1;
283 tag_new.base.style = -1;
284 tag_new.base.flags = parser->tag_flags;
285 parser->tag_flags = 0;
286
287 new_tag_idx = mdata_vector_append(
288 &(parser->tags), &tag_new, sizeof( union MHTML_TAG ) );
289 if( 0 > new_tag_idx ) {
290 retval = mdata_retval( new_tag_idx );
291 goto cleanup;
292 }
293
294 mdata_vector_lock( &(parser->tags) );
295 p_tag_new = mdata_vector_get(
296 &(parser->tags), new_tag_idx, union MHTML_TAG );
297 assert( NULL != p_tag_new );
298
299 if( 0 > parser->tag_iter ) {
300 mhtml_parser_set_tag_iter( parser, new_tag_idx );
301 goto cleanup;
302 }
303
304 /* Get the current tag_iter. */
305 p_tag_iter = mdata_vector_get(
306 &(parser->tags), parser->tag_iter, union MHTML_TAG );
307 assert( NULL != p_tag_iter );
308
309 /* Set new tag parent to current tag. */
310 p_tag_new->base.parent = parser->tag_iter;
311
312 /* Add new tag to current tag's children. */
313 if( 0 > p_tag_iter->base.first_child ) {
314 debug_printf( MHTML_TRACE_LVL,
315 "zxzx attached " SSIZE_T_FMT " as first child to "
316 SSIZE_T_FMT, new_tag_idx, parser->tag_iter );
317 p_tag_iter->base.first_child = new_tag_idx;
318 } else {
319 /* Find the last sibling child. */
320 next_sibling_idx = p_tag_iter->base.first_child;
321 p_tag_iter = mdata_vector_get(
322 &(parser->tags), next_sibling_idx, union MHTML_TAG );
323 while( NULL != p_tag_iter && 0 <= p_tag_iter->base.next_sibling ) {
324 next_sibling_idx = p_tag_iter->base.next_sibling;
325 p_tag_iter = mdata_vector_get(
326 &(parser->tags), next_sibling_idx, union MHTML_TAG );
327 }
328 assert( NULL != p_tag_iter );
329 p_tag_iter->base.next_sibling = new_tag_idx;
330 debug_printf( MHTML_TRACE_LVL,
331 "attached " SSIZE_T_FMT " as next sibling to "
332 SSIZE_T_FMT, new_tag_idx, next_sibling_idx );
333 }
334
335 debug_printf( MHTML_TRACE_LVL,
336 "pushed new tag " SSIZE_T_FMT " under " SSIZE_T_FMT,
337 new_tag_idx, p_tag_new->base.parent );
338
339 mhtml_parser_set_tag_iter( parser, new_tag_idx );
340
341cleanup:
342
343 mdata_vector_unlock( &(parser->tags) );
344
345 return retval;
346}
347
348MERROR_RETVAL mhtml_push_element_tag( struct MHTML_PARSER* parser ) {
349 MERROR_RETVAL retval = MERROR_OK;
350 size_t i = 0;
351 union MHTML_TAG* p_tag_iter = NULL;
352
353 mparser_token_upper( &((parser)->base), i );
354
355 if( 0 == strncmp( "STYLE", parser->base.token, 6 ) ) {
356 /* Special case: style tag. Don't push a new tag here, but set a flag for
357 * the text tag next created by mhtml_push_tag() so the contents are
358 * directly attached to the style tag.
359 */
360 parser->tag_flags |= MHTML_TAG_FLAG_STYLE;
361 goto cleanup;
362 }
363
364 retval = mhtml_push_tag( parser );
365 maug_cleanup_if_not_ok();
366
367 mdata_vector_lock( &(parser->tags) );
368
369 p_tag_iter = mdata_vector_get(
370 &(parser->tags), parser->tag_iter, union MHTML_TAG );
371 assert( NULL != p_tag_iter );
372
373 /* Figure out tag type. */
374 i = 0;
375 while( '\0' != gc_mhtml_tag_names[i][0] ) {
376 if(
377 parser->base.token_sz == maug_strlen( gc_mhtml_tag_names[i] ) &&
378 0 == strncmp(
379 gc_mhtml_tag_names[i], parser->base.token, parser->base.token_sz )
380 ) {
381 debug_printf( MHTML_TRACE_LVL,
382 "new tag (" SSIZE_T_FMT ") type: %s",
383 parser->tag_iter, gc_mhtml_tag_names[i] );
384 p_tag_iter->base.type = i;
385
386 if( MHTML_TAG_TYPE_BODY == i ) {
387 /* Special case: body tag. Keep track of it for later so it can
388 * be passed to the renderer.
389 */
390 assert( -1 == parser->body_idx );
391 parser->body_idx = parser->tag_iter;
392 debug_printf( MHTML_TRACE_LVL,
393 "set body index to: " SSIZE_T_FMT,
394 parser->body_idx );
395 }
396
397 goto cleanup;
398 }
399 i++;
400 }
401
402 error_printf( "could not find type for new tag (" SSIZE_T_FMT ")",
403 parser->tag_iter );
404
405cleanup:
406
407 if( mdata_vector_is_locked( &(parser->tags) ) ) {
408 mdata_vector_unlock( &(parser->tags) );
409 }
410
411 return retval;
412}
413
414MERROR_RETVAL mhtml_push_text_tag( struct MHTML_PARSER* parser ) {
415 MERROR_RETVAL retval = MERROR_OK;
416 size_t i = 0;
417 union MHTML_TAG* p_tag_iter = NULL;
418
419 retval = mhtml_push_tag( parser );
420 maug_cleanup_if_not_ok();
421
422 mdata_vector_lock( &(parser->tags) );
423
424 p_tag_iter = mdata_vector_get(
425 &(parser->tags), parser->tag_iter, union MHTML_TAG );
426 assert( NULL != p_tag_iter );
427
428 if(
429 MHTML_TAG_FLAG_STYLE == (MHTML_TAG_FLAG_STYLE &
430 p_tag_iter->base.flags)
431 ) {
432 p_tag_iter->base.type = MHTML_TAG_TYPE_STYLE;
433 } else {
434 p_tag_iter->base.type = MHTML_TAG_TYPE_TEXT;
435 }
436
437 if( MHTML_TAG_TYPE_STYLE == p_tag_iter->base.type ) {
438 /* TODO: If it's the last character and there's still a token, process it! */
439 debug_printf( MHTML_TRACE_LVL, "parsing STYLE tag..." );
440 for( ; parser->base.token_sz > i ; i++ ) {
441 retval = mcss_parse_c( &(parser->styler), parser->base.token[i] );
442 maug_cleanup_if_not_ok();
443 }
444 debug_printf( 1, "out of style characters..." );
445 mcss_parser_flush( &(parser->styler) );
446 mcss_parser_reset( &(parser->styler) );
447 } else {
448 /* Eliminate trailing spaces. */
449 while( ' ' == parser->base.token[parser->base.token_sz - 1] ) {
450 parser->base.token_sz--;
451 }
452
453 /* Copy token to tag text. */
454 p_tag_iter->TEXT.content_idx = mdata_strpool_append(
455 &(parser->strpool), parser->base.token, parser->base.token_sz );
456 p_tag_iter->TEXT.content_sz = parser->base.token_sz;
457 }
458
459 debug_printf( 1, "done processing tag contents..." );
460
461cleanup:
462
463 if( mdata_vector_is_locked( &(parser->tags) ) ) {
464 mdata_vector_unlock( &(parser->tags) );
465 }
466
467 return retval;
468}
469
470MERROR_RETVAL mhtml_push_attrib_key( struct MHTML_PARSER* parser ) {
471 MERROR_RETVAL retval = MERROR_OK;
472 size_t i = 0;
473
474 debug_printf( MHTML_TRACE_LVL, "attrib: %s", parser->base.token );
475
476 mparser_token_upper( &((parser)->base), i );
477
478 /* Figure out attrib type. */
479 i = 0;
480 while( '\0' != gc_mhtml_attrib_names[i][0] ) {
481 if(
482 parser->base.token_sz == maug_strlen( gc_mhtml_attrib_names[i] ) &&
483 0 == strncmp(
484 gc_mhtml_attrib_names[i], parser->base.token, parser->base.token_sz )
485 ) {
486 debug_printf(
487 MHTML_TRACE_LVL, "new attrib type: %s", gc_mhtml_attrib_names[i] );
488 parser->attrib_key = i;
489 goto cleanup;
490 }
491 i++;
492 }
493
494 error_printf( "unknown attrib: %s", parser->base.token );
495
496cleanup:
497
498 return retval;
499}
500
501static MERROR_RETVAL _mhtml_set_attrib_val( struct MHTML_PARSER* parser ) {
502 MERROR_RETVAL retval = MERROR_OK;
503 size_t i = 0;
504 union MHTML_TAG* p_tag_iter = NULL;
505
506 mdata_vector_lock( &(parser->tags) );
507
508 p_tag_iter = mdata_vector_get(
509 &(parser->tags), parser->tag_iter, union MHTML_TAG );
510 assert( NULL != p_tag_iter );
511
512 if( MHTML_ATTRIB_KEY_STYLE == parser->attrib_key ) {
513 debug_printf( MHTML_TRACE_LVL, "style: %s", parser->base.token );
514 /* TODO: Parse and attach style. */
515
516 /* Create an empty new style. */
517 mdata_vector_unlock( &(parser->tags) );
518 retval = mcss_push_style( &(parser->styler), MCSS_SELECT_NONE, NULL, 0 );
519 maug_cleanup_if_not_ok();
520 mdata_vector_lock( &(parser->tags) );
521
522 /* Set the new style as this tag's explicit style. */
523 p_tag_iter->base.style =
524 mdata_vector_ct( &(parser->styler.styles) ) - 1;
525
526 for( ; parser->base.token_sz > i ; i++ ) {
527 retval = mcss_parse_c( &(parser->styler), parser->base.token[i] );
528 maug_cleanup_if_not_ok();
529 }
530
531 debug_printf( 1, "out of style characters..." );
532 mcss_parser_flush( &(parser->styler) );
533
534 goto cleanup;
535
536 } else if( MHTML_ATTRIB_KEY_CLASS == parser->attrib_key ) {
537 maug_strncpy(
538 p_tag_iter->base.classes,
539 parser->base.token,
540 MCSS_CLASS_SZ_MAX );
541 p_tag_iter->base.classes_sz = parser->base.token_sz;
542
543 } else if( MHTML_ATTRIB_KEY_ID == parser->attrib_key ) {
544 maug_strncpy(
545 p_tag_iter->base.id,
546 parser->base.token,
547 MCSS_ID_SZ_MAX );
548 p_tag_iter->base.id_sz = parser->base.token_sz;
549
550 } else if( MHTML_ATTRIB_KEY_SRC == parser->attrib_key ) {
551 /* TODO: Validate tag type. */
552 maug_strncpy(
553 p_tag_iter->IMG.src,
554 parser->base.token,
555 MHTML_SRC_HREF_SZ_MAX );
556 p_tag_iter->IMG.src_sz = parser->base.token_sz;
557
558 } else if( MHTML_ATTRIB_KEY_TYPE == parser->attrib_key ) {
559 /* TODO: Validate tag type. */
560
561 if( 0 == maug_strncpy( parser->base.token, "button", 7 ) ) {
562 p_tag_iter->INPUT.input_type =
563 MHTML_INPUT_TYPE_BUTTON;
564 }
565
566 } else if( MHTML_ATTRIB_KEY_NAME == parser->attrib_key ) {
567 /* TODO: Validate tag type. */
568 maug_strncpy(
569 p_tag_iter->INPUT.name,
570 parser->base.token,
571 MCSS_ID_SZ_MAX );
572 p_tag_iter->INPUT.name_sz = parser->base.token_sz;
573
574 } else if( MHTML_ATTRIB_KEY_VALUE == parser->attrib_key ) {
575 /* TODO: Validate tag type. */
576 maug_strncpy(
577 p_tag_iter->INPUT.value,
578 parser->base.token,
579 MCSS_ID_SZ_MAX );
580 p_tag_iter->INPUT.value_sz = parser->base.token_sz;
581 }
582
583cleanup:
584
585 if( mdata_vector_is_locked( &(parser->tags) ) ) {
586 mdata_vector_unlock( &(parser->tags) );
587 }
588
589 return retval;
590}
591
592MERROR_RETVAL mhtml_parse_c( struct MHTML_PARSER* parser, char c ) {
593 MERROR_RETVAL retval = MERROR_OK;
594 union MHTML_TAG* p_tag_iter = NULL;
595 size_t tag_iter_type = 0;
596
597 switch( c ) {
598 case '<':
599 if( MHTML_PSTATE_NONE == mhtml_parser_pstate( parser ) ) {
600 if( 0 < parser->base.token_sz ) {
601 retval = mhtml_push_text_tag( parser );
602 maug_cleanup_if_not_ok();
603
604 /* Grab the current tag to check its type below. */
605 mdata_vector_lock( &(parser->tags) );
606 p_tag_iter = mdata_vector_get(
607 &(parser->tags), parser->tag_iter, union MHTML_TAG );
608 assert( NULL != p_tag_iter );
609 tag_iter_type = p_tag_iter->base.type;
610 mdata_vector_unlock( &(parser->tags) );
611
612 if(
613 /* See special exception in mhtml_push_tag(). Style tags don't
614 * push their subordinate text, so popping here would be
615 * uneven!
616 */
617 MHTML_TAG_TYPE_STYLE != tag_iter_type
618 ) {
619 /* Pop out of text so next tag isn't a child of it. */
620 retval = mhtml_pop_tag( parser );
621 maug_cleanup_if_not_ok();
622 }
623 }
624 retval = mhtml_parser_pstate_push( parser, MHTML_PSTATE_ELEMENT );
625 maug_cleanup_if_not_ok();
626 mhtml_parser_reset_token( parser );
627
628 } else {
629 mhtml_parser_invalid_c( parser, c, retval );
630 }
631 break;
632
633 case '>':
634 if( MHTML_PSTATE_ELEMENT == mhtml_parser_pstate( parser ) ) {
635 retval = mhtml_push_element_tag( parser );
636 maug_cleanup_if_not_ok();
637 mhtml_parser_pstate_pop( parser );
638 mhtml_parser_reset_token( parser );
639
640 } else if( MHTML_PSTATE_ATTRIB_KEY == mhtml_parser_pstate( parser ) ) {
641 mhtml_parser_pstate_pop( parser );
642 assert( MHTML_PSTATE_ELEMENT == mhtml_parser_pstate( parser ) );
643 mhtml_parser_pstate_pop( parser ); /* Pop element. */
644 mhtml_parser_reset_token( parser );
645
646 } else if( MHTML_PSTATE_END_ELEMENT == mhtml_parser_pstate( parser ) ) {
647
648 retval = mhtml_pop_tag( parser );
649 maug_cleanup_if_not_ok();
650
651 mhtml_parser_pstate_pop( parser );
652 if( MHTML_PSTATE_ATTRIB_KEY == mhtml_parser_pstate( parser ) ) {
653 mhtml_parser_pstate_pop( parser );
654 }
655 assert( MHTML_PSTATE_ELEMENT == mhtml_parser_pstate( parser ) );
656 mhtml_parser_pstate_pop( parser ); /* Pop element. */
657 mhtml_parser_reset_token( parser );
658
659 } else if( MHTML_PSTATE_STRING == mhtml_parser_pstate( parser ) ) {
660 retval = mhtml_parser_append_token( parser, c );
661 maug_cleanup_if_not_ok();
662
663 } else if( MHTML_PSTATE_NONE == mhtml_parser_pstate( parser ) ) {
664 retval = mhtml_parser_append_token( parser, c );
665 maug_cleanup_if_not_ok();
666
667 } else {
668 mhtml_parser_invalid_c( parser, c, retval );
669 }
670 break;
671
672 case '/':
673 if(
674 MHTML_PSTATE_ELEMENT == mhtml_parser_pstate( parser ) &&
675 0 == parser->base.token_sz
676 ) {
677 /* Start of a close tag. */
678 retval = mhtml_parser_pstate_push( parser, MHTML_PSTATE_END_ELEMENT );
679 maug_cleanup_if_not_ok();
680
681 } else if( MHTML_PSTATE_ATTRIB_KEY == mhtml_parser_pstate( parser ) ) {
682 /* Close of a self-closing tag. */
683 retval = mhtml_parser_pstate_push( parser, MHTML_PSTATE_END_ELEMENT );
684 maug_cleanup_if_not_ok();
685
686 } else if( MHTML_PSTATE_STRING == mhtml_parser_pstate( parser ) ) {
687 retval = mhtml_parser_append_token( parser, c );
688 maug_cleanup_if_not_ok();
689
690 } else if( MHTML_PSTATE_NONE == mhtml_parser_pstate( parser ) ) {
691 retval = mhtml_parser_append_token( parser, c );
692 maug_cleanup_if_not_ok();
693
694 } else {
695 mhtml_parser_invalid_c( parser, c, retval );
696 }
697 break;
698
699 case '=':
700 if( MHTML_PSTATE_ATTRIB_KEY == mhtml_parser_pstate( parser ) ) {
701 retval = mhtml_push_attrib_key( parser );
702 maug_cleanup_if_not_ok();
703 retval = mhtml_parser_pstate_push( parser, MHTML_PSTATE_ATTRIB_VAL );
704 maug_cleanup_if_not_ok();
705 mhtml_parser_reset_token( parser );
706
707 } else if( MHTML_PSTATE_ATTRIB_VAL == mhtml_parser_pstate( parser ) ) {
708 retval = mhtml_parser_append_token( parser, c );
709 maug_cleanup_if_not_ok();
710
711 } else if( MHTML_PSTATE_NONE == mhtml_parser_pstate( parser ) ) {
712 retval = mhtml_parser_append_token( parser, c );
713 maug_cleanup_if_not_ok();
714
715 } else {
716 mhtml_parser_invalid_c( parser, '_', retval );
717 }
718 break;
719
720 case '"':
721 if( MHTML_PSTATE_ATTRIB_VAL == mhtml_parser_pstate( parser ) ) {
722 retval = mhtml_parser_pstate_push( parser, MHTML_PSTATE_STRING );
723 maug_cleanup_if_not_ok();
724 mhtml_parser_reset_token( parser );
725
726 } else if( MHTML_PSTATE_STRING == mhtml_parser_pstate( parser ) ) {
727 retval = _mhtml_set_attrib_val( parser );
728 maug_cleanup_if_not_ok();
729 mhtml_parser_pstate_pop( parser );
730 assert( MHTML_PSTATE_ATTRIB_VAL == mhtml_parser_pstate( parser ) );
731 mhtml_parser_pstate_pop( parser );
732 mhtml_parser_reset_token( parser );
733
734 } else if( MHTML_PSTATE_NONE == mhtml_parser_pstate( parser ) ) {
735 retval = mhtml_parser_append_token( parser, c );
736 maug_cleanup_if_not_ok();
737
738 } else {
739 mhtml_parser_invalid_c( parser, '_', retval );
740 }
741 break;
742
743 case '\r':
744 case '\n':
745 case '\t':
746 break;
747
748 case ' ':
749 if( MHTML_PSTATE_ELEMENT == mhtml_parser_pstate( parser ) ) {
750 retval = mhtml_push_element_tag( parser );
751 maug_cleanup_if_not_ok();
752 retval = mhtml_parser_pstate_push( parser, MHTML_PSTATE_ATTRIB_KEY );
753 maug_cleanup_if_not_ok();
754 mhtml_parser_reset_token( parser );
755
756 } else if( MHTML_PSTATE_STRING == mhtml_parser_pstate( parser ) ) {
757 retval = mhtml_parser_append_token( parser, c );
758 maug_cleanup_if_not_ok();
759
760 } else if( MHTML_PSTATE_ATTRIB_KEY == mhtml_parser_pstate( parser ) ) {
761 /* Do nothing. */
762
763 } else if( MHTML_PSTATE_NONE == mhtml_parser_pstate( parser ) ) {
764 /* Avoid a token that's only whitespace. */
765 if(
766 0 < parser->base.token_sz &&
767 ' ' != parser->base.token[parser->base.token_sz - 1]
768 ) {
769 retval = mhtml_parser_append_token( parser, ' ' );
770 maug_cleanup_if_not_ok();
771 }
772
773 } else {
774 mhtml_parser_invalid_c( parser, '_', retval );
775 }
776 break;
777
778 default:
779 retval = mhtml_parser_append_token( parser, c );
780 maug_cleanup_if_not_ok();
781 break;
782 }
783
784 parser->base.i++;
785
786 mparser_wait( &((parser)->base) );
787
788cleanup:
789
790 parser->base.last_c = c;
791
792 if( mdata_vector_is_locked( &(parser->tags) ) ) {
793 mdata_vector_unlock( &(parser->tags) );
794 }
795
796 return retval;
797}
798
799MERROR_RETVAL mhtml_parser_init( struct MHTML_PARSER* parser ) {
800 MERROR_RETVAL retval = MERROR_OK;
801
802 /* Perform initial tag allocation. */
803 mhtml_parser_set_tag_iter( parser, -1 );
804 parser->body_idx = -1;
805
806 retval = mcss_parser_init( &(parser->styler) );
807 maug_cleanup_if_not_ok();
808
809cleanup:
810
811 return retval;
812}
813
814MERROR_RETVAL mhtml_dump_tree(
815 struct MHTML_PARSER* parser, ssize_t iter, size_t d
816) {
817 size_t i = 0;
818 char* strpool = NULL;
819 char dump_line[MHTML_DUMP_LINE_SZ + 1];
820 union MHTML_TAG* p_tag_iter = NULL;
821 ssize_t first_child = -1;
822 ssize_t next_sibling = -1;
823 MERROR_RETVAL retval = MERROR_OK;
824
825 if( 0 > iter ) {
826 return retval;
827 }
828
829 mdata_vector_lock( &(parser->tags) );
830
831 p_tag_iter = mdata_vector_get( &(parser->tags), iter, union MHTML_TAG );
832 assert( NULL != p_tag_iter );
833
834 maug_mzero( dump_line, MHTML_DUMP_LINE_SZ + 1 );
835
836 for( i = 0 ; d > i ; i++ ) {
837 assert( i < MHTML_DUMP_LINE_SZ );
838 strcat( dump_line, " " );
839 }
840 if( MHTML_TAG_TYPE_TEXT == p_tag_iter->base.type ) {
841 if( -1 == p_tag_iter->TEXT.content_idx ) {
842 error_printf( "no tag content present!" );
843 goto cleanup;
844 }
845
846 mdata_strpool_lock( &(parser->strpool), strpool );
847
848 if(
849 maug_strlen( dump_line ) + 7 /* ("TEXT: \n") */
850 + p_tag_iter->TEXT.content_sz < MHTML_DUMP_LINE_SZ
851 ) {
852 strcat( dump_line, "TEXT: " );
853 strcat( dump_line, &(strpool[p_tag_iter->TEXT.content_idx]) );
854 strcat( dump_line, "\n" );
855 }
856
857 mdata_strpool_unlock( &(parser->strpool), strpool );
858
859 } else {
860 if(
861 maug_strlen( dump_line ) +
862 maug_strlen( gc_mhtml_tag_names[p_tag_iter->base.type] ) <
863 MHTML_DUMP_LINE_SZ
864 ) {
865 strcat( dump_line,
866 gc_mhtml_tag_names[p_tag_iter->base.type] );
867 }
868
869 if(
870 0 <= p_tag_iter->base.style &&
871 maug_strlen( dump_line ) + 9 /* (styled) */ < MHTML_DUMP_LINE_SZ
872 ) {
873 strcat( dump_line, " (styled)" );
874 }
875
876 if(
877 0 < p_tag_iter->base.id_sz &&
878 maug_strlen( dump_line ) + 7 /* (id: ) */
879 + maug_strlen( p_tag_iter->base.id ) < MHTML_DUMP_LINE_SZ
880 ) {
881 maug_snprintf( &(dump_line[maug_strlen( dump_line )]),
882 MHTML_DUMP_LINE_SZ - maug_strlen( dump_line ),
883 " (id: %s)", p_tag_iter->base.id );
884 }
885
886 if(
887 0 < p_tag_iter->base.classes_sz &&
888 maug_strlen( dump_line ) + 12 /* (classes: ) */
889 + maug_strlen( p_tag_iter->base.id ) < MHTML_DUMP_LINE_SZ
890 ) {
891 maug_snprintf( &(dump_line[maug_strlen( dump_line )]),
892 MHTML_DUMP_LINE_SZ - maug_strlen( dump_line ),
893 " (classes: %s)", p_tag_iter->base.classes );
894 }
895
896 if(
897 MHTML_TAG_TYPE_IMG == p_tag_iter->base.type &&
898 0 < p_tag_iter->IMG.src_sz &&
899 maug_strlen( dump_line ) + 8 /* (src: ) */
900 + maug_strlen( p_tag_iter->IMG.src ) < MHTML_DUMP_LINE_SZ
901 ) {
902 maug_snprintf( &(dump_line[maug_strlen( dump_line )]),
903 MHTML_DUMP_LINE_SZ - maug_strlen( dump_line ),
904 " (src: %s)", p_tag_iter->IMG.src );
905 }
906
907 if(
908 MHTML_TAG_TYPE_INPUT == p_tag_iter->base.type &&
909 0 < p_tag_iter->INPUT.value_sz &&
910 maug_strlen( dump_line ) + 10 /* (value: ) */
911 + maug_strlen( p_tag_iter->INPUT.value ) < MHTML_DUMP_LINE_SZ
912 ) {
913 maug_snprintf( &(dump_line[maug_strlen( dump_line )]),
914 MHTML_DUMP_LINE_SZ - maug_strlen( dump_line ),
915 " (value: %s)", p_tag_iter->INPUT.value );
916 }
917
918 }
919
920 debug_printf( 1, "%s", dump_line );
921
922 first_child = p_tag_iter->base.first_child;
923 next_sibling = p_tag_iter->base.next_sibling;
924
925 mdata_vector_unlock( &(parser->tags) );
926
927 retval = mhtml_dump_tree( parser, first_child, d + 1 );
928 maug_cleanup_if_not_ok();
929
930 retval = mhtml_dump_tree( parser, next_sibling, d );
931 maug_cleanup_if_not_ok();
932
933cleanup:
934
935 if( mdata_vector_is_locked( &(parser->tags) ) ) {
936 mdata_vector_unlock( &(parser->tags) );
937 }
938
939 return retval;
940}
941
942#else
943
/* Without MHTML_C, only declare what the implementation unit defines: one
 * extern MHTML_TAG_TYPE_<name> constant per tag table row...
 */
#define MHTML_TAG_TABLE_CONST( tag_id, tag_name, fields, disp ) \
   extern MAUG_CONST uint16_t SEG_MCONST MHTML_TAG_TYPE_ ## tag_name;

MHTML_TAG_TABLE( MHTML_TAG_TABLE_CONST )

/* ...and the tag name list, indexed by tag id. */
extern MAUG_CONST char* SEG_MCONST gc_mhtml_tag_names[];
950
951#endif /* MHTML_C */
952
953#endif /* !MHTML_H */
954
int MERROR_RETVAL
Return type indicating function returns a value from this list.
Definition merror.h:19
#define maug_mzero(ptr, sz)
Zero the block of memory pointed to by ptr.
Definition mmem.h:62
Definition mhtml.h:150
uint8_t tag_flags
Flags to be pushed to MHTML_TAG_BASE::flags on next mhtml_push_tag().
Definition mhtml.h:158
Definition mhtml.h:119
Definition mhtml.h:145