| /* |
| * Copyright (c) 2003, 2004 X/IO Labs, xiolabs.com. |
| * Copyright (c) 2003, 2004, 2005 Lev Walkin <vlm@lionet.info>. |
| * All rights reserved. |
| * Redistribution and modifications are permitted subject to BSD license. |
| */ |
| #include <asn_system.h> |
| #include <xer_support.h> |
| |
/*
 * States of the XML tokenizer (pxml_parse).
 * The current state is stored by the caller as a plain int between calls,
 * so the numeric values are spelled out explicitly.
 * For the comment-related states, the trailing note shows which character
 * of the "<!--" or "-->" delimiter has just been consumed.
 */
typedef enum {
	ST_TEXT			= 0,	/* Plain text, outside of any tag */
	ST_TAG_START		= 1,	/* Just seen '<' */
	ST_TAG_BODY		= 2,	/* Inside a tag, outside of an attribute value */
	ST_TAG_QUOTE_WAIT	= 3,	/* Just seen '=' inside a tag */
	ST_TAG_QUOTED_STRING	= 4,	/* Inside a "double-quoted" attribute value */
	ST_TAG_UNQUOTED_STRING	= 5,	/* Inside an unquoted attribute value */
	ST_COMMENT_WAIT_DASH1	= 6,	/* "<!--"[1] */
	ST_COMMENT_WAIT_DASH2	= 7,	/* "<!--"[2] */
	ST_COMMENT		= 8,	/* Inside the comment body */
	ST_COMMENT_CLO_DASH2	= 9,	/* "-->"[0] */
	ST_COMMENT_CLO_RT	= 10	/* "-->"[1] */
} pstate_e;
| |
/*
 * Byte classification table, indexed by the unsigned byte value:
 *	0 - none of the classes below
 *	1 - whitespace (HT, LF, FF, CR, SP)
 *	2 - decimal digit
 *	3 - ASCII letter
 * Only the first 128 entries are listed; the remaining ones (0x80..0xFF)
 * are implicitly zero-initialized.
 */
static const int
_charclass[256] = {
	/* 0x00 */	0,0,0,0,0,0,0,0,
	/* 0x08 */	0,1,1,0,1,1,0,0,	/* HT LF -- FF CR */
	/* 0x10 */	0,0,0,0,0,0,0,0,
	/* 0x18 */	0,0,0,0,0,0,0,0,
	/* 0x20 */	1,0,0,0,0,0,0,0,	/* SP */
	/* 0x28 */	0,0,0,0,0,0,0,0,
	/* 0x30 */	2,2,2,2,2,2,2,2,	/* 01234567 */
	/* 0x38 */	2,2,0,0,0,0,0,0,	/* 89 */
	/* 0x40 */	0,3,3,3,3,3,3,3,	/*  ABCDEFG */
	/* 0x48 */	3,3,3,3,3,3,3,3,	/* HIJKLMNO */
	/* 0x50 */	3,3,3,3,3,3,3,3,	/* PQRSTUVW */
	/* 0x58 */	3,3,3,0,0,0,0,0,	/* XYZ */
	/* 0x60 */	0,3,3,3,3,3,3,3,	/*  abcdefg */
	/* 0x68 */	3,3,3,3,3,3,3,3,	/* hijklmno */
	/* 0x70 */	3,3,3,3,3,3,3,3,	/* pqrstuvw */
	/* 0x78 */	3,3,3,0,0,0,0,0		/* xyz */
};

/*
 * Classification predicates. The argument is converted to unsigned char
 * before indexing, so any char or int byte value stays within the table.
 */
#define	WHITESPACE(c)	(_charclass[(unsigned char)(c)] == 1)	/* class 1 */
#define	ALNUM(c)	(_charclass[(unsigned char)(c)] > 1)	/* class 2 or 3 */
#define	ALPHA(c)	(_charclass[(unsigned char)(c)] == 3)	/* class 3 */
| |
/*
 * Symbolic names for the markup characters recognized by the parser.
 * The values are the ASCII/UTF-8 code points, given numerically.
 */
enum {
	EXCLAM	= 0x21,	/* '!' */
	CQUOTE	= 0x22,	/* '"' */
	CDASH	= 0x2d,	/* '-' */
	CSLASH	= 0x2f,	/* '/' */
	LANGLE	= 0x3c,	/* '<' */
	CEQUAL	= 0x3d,	/* '=' */
	RANGLE	= 0x3e,	/* '>' */
	CQUEST	= 0x3f	/* '?' */
};
| |
/*
 * Token delivery helpers.
 *
 * These macros are meant to be expanded only inside a function which has
 * the following names in scope: `p` (current byte), `chunk_start` (first
 * byte not yet delivered), `state`, `cb`, `key`, and a `finish` label.
 *
 * TOKEN_CB_CALL(type, _ns, _current_too, _final):
 *	type		chunk type passed to cb() as its first argument;
 *	_ns		parser state to switch to (evaluated exactly once);
 *	_current_too	1 if the byte at `p` belongs to the chunk, 0 if not;
 *	_final		accepted for symmetry, currently not used.
 *
 * Behavior:
 *  - If the chunk is empty, only the state is switched.
 *  - Otherwise cb(type, chunk_start, size, key) is invoked.
 *  - If cb() returns a value smaller than the chunk size, control jumps
 *    to `finish` without advancing `chunk_start`. In that case the state
 *    is still switched when the callback returned -1 and the current byte
 *    was part of the chunk; in all other short-return cases the state is
 *    left as it was.
 *  - On a full-size return, `chunk_start` moves past the delivered chunk
 *    and the state is switched.
 */
#define	TOKEN_CB_CALL(type, _ns, _current_too, _final) do {		\
		int _ret;						\
		const pstate_e _next = (_ns);				\
		const ssize_t _sz = (p - chunk_start) + (_current_too);	\
		if(!_sz) {						\
			/* Shortcut: nothing to deliver */		\
			state = _next;					\
			break;						\
		}							\
		_ret = cb((type), chunk_start, _sz, key);		\
		if(_ret < _sz) {					\
			if((_current_too) && _ret == -1)		\
				state = _next;				\
			goto finish;					\
		}							\
		chunk_start = p + (_current_too);			\
		state = _next;						\
	} while(0)

/* Deliver a chunk of the given type as is. */
#define	TOKEN_CB(_type, _ns, _current_too)				\
	TOKEN_CB_CALL(_type, _ns, _current_too, 0)

/* Chunk types used when a tag or a comment is known to be complete. */
#define	PXML_TAG_FINAL_CHUNK_TYPE	PXML_TAG_END
#define	PXML_COMMENT_FINAL_CHUNK_TYPE	PXML_COMMENT_END

/*
 * Deliver the closing chunk of a tag or comment: the given type name
 * (PXML_TAG or PXML_COMMENT) is mapped to its *_FINAL_CHUNK_TYPE.
 */
#define	TOKEN_CB_FINAL(_type, _ns, _current_too)			\
	TOKEN_CB_CALL( _type ## _FINAL_CHUNK_TYPE , _ns, _current_too, 1)
| |
| /* |
| * Parser itself |
| */ |
| ssize_t pxml_parse(int *stateContext, const void *xmlbuf, size_t size, pxml_callback_f *cb, void *key) { |
| pstate_e state = (pstate_e)*stateContext; |
| const char *chunk_start = (const char *)xmlbuf; |
| const char *p = chunk_start; |
| const char *end = p + size; |
| |
| for(; p < end; p++) { |
| int C = *(const unsigned char *)p; |
| switch(state) { |
| case ST_TEXT: |
| /* |
| * Initial state: we're in the middle of some text, |
| * or just have started. |
| */ |
| if (C == LANGLE) |
| /* We're now in the tag, probably */ |
| TOKEN_CB(PXML_TEXT, ST_TAG_START, 0); |
| break; |
| case ST_TAG_START: |
| if (ALPHA(C) || (C == CSLASH)) |
| state = ST_TAG_BODY; |
| else if (C == EXCLAM) |
| state = ST_COMMENT_WAIT_DASH1; |
| else |
| /* |
| * Not characters and not whitespace. |
| * Must be something like "3 < 4". |
| */ |
| TOKEN_CB(PXML_TEXT, ST_TEXT, 1);/* Flush as data */ |
| break; |
| case ST_TAG_BODY: |
| switch(C) { |
| case RANGLE: |
| /* End of the tag */ |
| TOKEN_CB_FINAL(PXML_TAG, ST_TEXT, 1); |
| break; |
| case LANGLE: |
| /* |
| * The previous tag wasn't completed, but still |
| * recognized as valid. (Mozilla-compatible) |
| */ |
| TOKEN_CB_FINAL(PXML_TAG, ST_TAG_START, 0); |
| break; |
| case CEQUAL: |
| state = ST_TAG_QUOTE_WAIT; |
| break; |
| } |
| break; |
| case ST_TAG_QUOTE_WAIT: |
| /* |
| * State after the equal sign ("=") in the tag. |
| */ |
| switch(C) { |
| case CQUOTE: |
| state = ST_TAG_QUOTED_STRING; |
| break; |
| case RANGLE: |
| /* End of the tag */ |
| TOKEN_CB_FINAL(PXML_TAG, ST_TEXT, 1); |
| break; |
| default: |
| if(!WHITESPACE(C)) |
| /* Unquoted string value */ |
| state = ST_TAG_UNQUOTED_STRING; |
| } |
| break; |
| case ST_TAG_QUOTED_STRING: |
| /* |
| * Tag attribute's string value in quotes. |
| */ |
| if(C == CQUOTE) { |
| /* Return back to the tag state */ |
| state = ST_TAG_BODY; |
| } |
| break; |
| case ST_TAG_UNQUOTED_STRING: |
| if(C == RANGLE) { |
| /* End of the tag */ |
| TOKEN_CB_FINAL(PXML_TAG, ST_TEXT, 1); |
| } else if(WHITESPACE(C)) { |
| /* Return back to the tag state */ |
| state = ST_TAG_BODY; |
| } |
| break; |
| case ST_COMMENT_WAIT_DASH1: |
| if(C == CDASH) { |
| state = ST_COMMENT_WAIT_DASH2; |
| } else { |
| /* Some ordinary tag. */ |
| state = ST_TAG_BODY; |
| } |
| break; |
| case ST_COMMENT_WAIT_DASH2: |
| if(C == CDASH) { |
| /* Seen "<--" */ |
| state = ST_COMMENT; |
| } else { |
| /* Some ordinary tag */ |
| state = ST_TAG_BODY; |
| } |
| break; |
| case ST_COMMENT: |
| if(C == CDASH) { |
| state = ST_COMMENT_CLO_DASH2; |
| } |
| break; |
| case ST_COMMENT_CLO_DASH2: |
| if(C == CDASH) { |
| state = ST_COMMENT_CLO_RT; |
| } else { |
| /* This is not an end of a comment */ |
| state = ST_COMMENT; |
| } |
| break; |
| case ST_COMMENT_CLO_RT: |
| if(C == RANGLE) { |
| TOKEN_CB_FINAL(PXML_COMMENT, ST_TEXT, 1); |
| } else if(C == CDASH) { |
| /* Maintain current state, still waiting for '>' */ |
| } else { |
| state = ST_COMMENT; |
| } |
| break; |
| } /* switch(*ptr) */ |
| } /* for() */ |
| |
| /* |
| * Flush the partially processed chunk, state permitting. |
| */ |
| if(p - chunk_start) { |
| switch (state) { |
| case ST_COMMENT: |
| TOKEN_CB(PXML_COMMENT, state, 0); |
| break; |
| case ST_TEXT: |
| TOKEN_CB(PXML_TEXT, state, 0); |
| break; |
| default: break; /* a no-op */ |
| } |
| } |
| |
| finish: |
| *stateContext = (int)state; |
| return chunk_start - (const char *)xmlbuf; |
| } |
| |