Add a lexer for JSON
Our JSON parser is a three stage parser. The first stage tokenizes the stream into a set of lexical tokens. Since the lexical grammar is regular, we can use a finite state machine to model it. The state machine will emit tokens as they are identified. Signed-off-by: Anthony Liguori <aliguori@us.ibm.com>
This commit is contained in:
		
							parent
							
								
									f7e6b1927f
								
							
						
					
					
						commit
						5ab8558d9b
					
				
							
								
								
									
										2
									
								
								Makefile
								
								
								
								
							
							
						
						
									
										2
									
								
								Makefile
								
								
								
								
							| 
						 | 
				
			
			@ -137,7 +137,7 @@ obj-y += buffered_file.o migration.o migration-tcp.o qemu-sockets.o
 | 
			
		|||
obj-y += qemu-char.o aio.o savevm.o
 | 
			
		||||
obj-y += msmouse.o ps2.o
 | 
			
		||||
obj-y += qdev.o qdev-properties.o
 | 
			
		||||
obj-y += qint.o qstring.o qdict.o qlist.o qfloat.o qbool.o
 | 
			
		||||
obj-y += qint.o qstring.o qdict.o qlist.o qfloat.o qbool.o json-lexer.o
 | 
			
		||||
obj-y += qemu-config.o block-migration.o
 | 
			
		||||
 | 
			
		||||
obj-$(CONFIG_BRLAPI) += baum.o
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -0,0 +1,327 @@
 | 
			
		|||
/*
 | 
			
		||||
 * JSON lexer
 | 
			
		||||
 *
 | 
			
		||||
 * Copyright IBM, Corp. 2009
 | 
			
		||||
 *
 | 
			
		||||
 * Authors:
 | 
			
		||||
 *  Anthony Liguori   <aliguori@us.ibm.com>
 | 
			
		||||
 *
 | 
			
		||||
 * This work is licensed under the terms of the GNU LGPL, version 2.1 or later.
 | 
			
		||||
 * See the COPYING.LIB file in the top-level directory.
 | 
			
		||||
 *
 | 
			
		||||
 */
 | 
			
		||||
 | 
			
		||||
#include "qstring.h"
 | 
			
		||||
#include "qlist.h"
 | 
			
		||||
#include "qdict.h"
 | 
			
		||||
#include "qint.h"
 | 
			
		||||
#include "qemu-common.h"
 | 
			
		||||
#include "json-lexer.h"
 | 
			
		||||
 | 
			
		||||
/*
 | 
			
		||||
 * \"([^\\\"]|(\\\"\\'\\\\\\/\\b\\f\\n\\r\\t\\u[0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F]))*\"
 | 
			
		||||
 * '([^\\']|(\\\"\\'\\\\\\/\\b\\f\\n\\r\\t\\u[0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F]))*'
 | 
			
		||||
 * 0|([1-9][0-9]*(.[0-9]+)?([eE]([-+])?[0-9]+))
 | 
			
		||||
 * [{}\[\],:]
 | 
			
		||||
 * [a-z]+
 | 
			
		||||
 *
 | 
			
		||||
 */
 | 
			
		||||
 | 
			
		||||
enum json_lexer_state {
 | 
			
		||||
    ERROR = 0,
 | 
			
		||||
    IN_DONE_STRING,
 | 
			
		||||
    IN_DQ_UCODE3,
 | 
			
		||||
    IN_DQ_UCODE2,
 | 
			
		||||
    IN_DQ_UCODE1,
 | 
			
		||||
    IN_DQ_UCODE0,
 | 
			
		||||
    IN_DQ_STRING_ESCAPE,
 | 
			
		||||
    IN_DQ_STRING,
 | 
			
		||||
    IN_SQ_UCODE3,
 | 
			
		||||
    IN_SQ_UCODE2,
 | 
			
		||||
    IN_SQ_UCODE1,
 | 
			
		||||
    IN_SQ_UCODE0,
 | 
			
		||||
    IN_SQ_STRING_ESCAPE,
 | 
			
		||||
    IN_SQ_STRING,
 | 
			
		||||
    IN_ZERO,
 | 
			
		||||
    IN_DIGITS,
 | 
			
		||||
    IN_DIGIT,
 | 
			
		||||
    IN_EXP_E,
 | 
			
		||||
    IN_MANTISSA,
 | 
			
		||||
    IN_MANTISSA_DIGITS,
 | 
			
		||||
    IN_NONZERO_NUMBER,
 | 
			
		||||
    IN_NEG_NONZERO_NUMBER,
 | 
			
		||||
    IN_KEYWORD,
 | 
			
		||||
    IN_ESCAPE,
 | 
			
		||||
    IN_ESCAPE_L,
 | 
			
		||||
    IN_ESCAPE_LL,
 | 
			
		||||
    IN_ESCAPE_DONE,
 | 
			
		||||
    IN_WHITESPACE,
 | 
			
		||||
    IN_OPERATOR_DONE,
 | 
			
		||||
    IN_START,
 | 
			
		||||
};
 | 
			
		||||
 | 
			
		||||
#define TERMINAL(state) [0 ... 0x7F] = (state)
 | 
			
		||||
 | 
			
		||||
static const uint8_t json_lexer[][256] =  {
 | 
			
		||||
    [IN_DONE_STRING] = {
 | 
			
		||||
        TERMINAL(JSON_STRING),
 | 
			
		||||
    },
 | 
			
		||||
 | 
			
		||||
    /* double quote string */
 | 
			
		||||
    [IN_DQ_UCODE3] = {
 | 
			
		||||
        ['0' ... '9'] = IN_DQ_STRING,
 | 
			
		||||
        ['a' ... 'f'] = IN_DQ_STRING,
 | 
			
		||||
        ['A' ... 'F'] = IN_DQ_STRING,
 | 
			
		||||
    },
 | 
			
		||||
    [IN_DQ_UCODE2] = {
 | 
			
		||||
        ['0' ... '9'] = IN_DQ_UCODE3,
 | 
			
		||||
        ['a' ... 'f'] = IN_DQ_UCODE3,
 | 
			
		||||
        ['A' ... 'F'] = IN_DQ_UCODE3,
 | 
			
		||||
    },
 | 
			
		||||
    [IN_DQ_UCODE1] = {
 | 
			
		||||
        ['0' ... '9'] = IN_DQ_UCODE2,
 | 
			
		||||
        ['a' ... 'f'] = IN_DQ_UCODE2,
 | 
			
		||||
        ['A' ... 'F'] = IN_DQ_UCODE2,
 | 
			
		||||
    },
 | 
			
		||||
    [IN_DQ_UCODE0] = {
 | 
			
		||||
        ['0' ... '9'] = IN_DQ_UCODE1,
 | 
			
		||||
        ['a' ... 'f'] = IN_DQ_UCODE1,
 | 
			
		||||
        ['A' ... 'F'] = IN_DQ_UCODE1,
 | 
			
		||||
    },
 | 
			
		||||
    [IN_DQ_STRING_ESCAPE] = {
 | 
			
		||||
        ['b'] = IN_DQ_STRING,
 | 
			
		||||
        ['f'] =  IN_DQ_STRING,
 | 
			
		||||
        ['n'] =  IN_DQ_STRING,
 | 
			
		||||
        ['r'] =  IN_DQ_STRING,
 | 
			
		||||
        ['t'] =  IN_DQ_STRING,
 | 
			
		||||
        ['\''] = IN_DQ_STRING,
 | 
			
		||||
        ['\"'] = IN_DQ_STRING,
 | 
			
		||||
        ['u'] = IN_DQ_UCODE0,
 | 
			
		||||
    },
 | 
			
		||||
    [IN_DQ_STRING] = {
 | 
			
		||||
        [1 ... 0xFF] = IN_DQ_STRING,
 | 
			
		||||
        ['\\'] = IN_DQ_STRING_ESCAPE,
 | 
			
		||||
        ['"'] = IN_DONE_STRING,
 | 
			
		||||
    },
 | 
			
		||||
 | 
			
		||||
    /* single quote string */
 | 
			
		||||
    [IN_SQ_UCODE3] = {
 | 
			
		||||
        ['0' ... '9'] = IN_SQ_STRING,
 | 
			
		||||
        ['a' ... 'f'] = IN_SQ_STRING,
 | 
			
		||||
        ['A' ... 'F'] = IN_SQ_STRING,
 | 
			
		||||
    },
 | 
			
		||||
    [IN_SQ_UCODE2] = {
 | 
			
		||||
        ['0' ... '9'] = IN_SQ_UCODE3,
 | 
			
		||||
        ['a' ... 'f'] = IN_SQ_UCODE3,
 | 
			
		||||
        ['A' ... 'F'] = IN_SQ_UCODE3,
 | 
			
		||||
    },
 | 
			
		||||
    [IN_SQ_UCODE1] = {
 | 
			
		||||
        ['0' ... '9'] = IN_SQ_UCODE2,
 | 
			
		||||
        ['a' ... 'f'] = IN_SQ_UCODE2,
 | 
			
		||||
        ['A' ... 'F'] = IN_SQ_UCODE2,
 | 
			
		||||
    },
 | 
			
		||||
    [IN_SQ_UCODE0] = {
 | 
			
		||||
        ['0' ... '9'] = IN_SQ_UCODE1,
 | 
			
		||||
        ['a' ... 'f'] = IN_SQ_UCODE1,
 | 
			
		||||
        ['A' ... 'F'] = IN_SQ_UCODE1,
 | 
			
		||||
    },
 | 
			
		||||
    [IN_SQ_STRING_ESCAPE] = {
 | 
			
		||||
        ['b'] = IN_SQ_STRING,
 | 
			
		||||
        ['f'] =  IN_SQ_STRING,
 | 
			
		||||
        ['n'] =  IN_SQ_STRING,
 | 
			
		||||
        ['r'] =  IN_SQ_STRING,
 | 
			
		||||
        ['t'] =  IN_SQ_STRING,
 | 
			
		||||
        ['\''] = IN_SQ_STRING,
 | 
			
		||||
        ['\"'] = IN_SQ_STRING,
 | 
			
		||||
        ['u'] = IN_SQ_UCODE0,
 | 
			
		||||
    },
 | 
			
		||||
    [IN_SQ_STRING] = {
 | 
			
		||||
        [1 ... 0xFF] = IN_SQ_STRING,
 | 
			
		||||
        ['\\'] = IN_SQ_STRING_ESCAPE,
 | 
			
		||||
        ['\''] = IN_DONE_STRING,
 | 
			
		||||
    },
 | 
			
		||||
 | 
			
		||||
    /* Zero */
 | 
			
		||||
    [IN_ZERO] = {
 | 
			
		||||
        TERMINAL(JSON_INTEGER),
 | 
			
		||||
        ['0' ... '9'] = ERROR,
 | 
			
		||||
        ['.'] = IN_MANTISSA,
 | 
			
		||||
    },
 | 
			
		||||
 | 
			
		||||
    /* Float */
 | 
			
		||||
    [IN_DIGITS] = {
 | 
			
		||||
        TERMINAL(JSON_FLOAT),
 | 
			
		||||
        ['0' ... '9'] = IN_DIGITS,
 | 
			
		||||
    },
 | 
			
		||||
 | 
			
		||||
    [IN_DIGIT] = {
 | 
			
		||||
        ['0' ... '9'] = IN_DIGITS,
 | 
			
		||||
    },
 | 
			
		||||
 | 
			
		||||
    [IN_EXP_E] = {
 | 
			
		||||
        ['-'] = IN_DIGIT,
 | 
			
		||||
        ['+'] = IN_DIGIT,
 | 
			
		||||
        ['0' ... '9'] = IN_DIGITS,
 | 
			
		||||
    },
 | 
			
		||||
 | 
			
		||||
    [IN_MANTISSA_DIGITS] = {
 | 
			
		||||
        TERMINAL(JSON_FLOAT),
 | 
			
		||||
        ['0' ... '9'] = IN_MANTISSA_DIGITS,
 | 
			
		||||
        ['e'] = IN_EXP_E,
 | 
			
		||||
        ['E'] = IN_EXP_E,
 | 
			
		||||
    },
 | 
			
		||||
 | 
			
		||||
    [IN_MANTISSA] = {
 | 
			
		||||
        ['0' ... '9'] = IN_MANTISSA_DIGITS,
 | 
			
		||||
    },
 | 
			
		||||
 | 
			
		||||
    /* Number */
 | 
			
		||||
    [IN_NONZERO_NUMBER] = {
 | 
			
		||||
        TERMINAL(JSON_INTEGER),
 | 
			
		||||
        ['0' ... '9'] = IN_NONZERO_NUMBER,
 | 
			
		||||
        ['e'] = IN_EXP_E,
 | 
			
		||||
        ['E'] = IN_EXP_E,
 | 
			
		||||
        ['.'] = IN_MANTISSA,
 | 
			
		||||
    },
 | 
			
		||||
 | 
			
		||||
    [IN_NEG_NONZERO_NUMBER] = {
 | 
			
		||||
        ['0'] = IN_ZERO,
 | 
			
		||||
        ['1' ... '9'] = IN_NONZERO_NUMBER,
 | 
			
		||||
    },
 | 
			
		||||
 | 
			
		||||
    /* keywords */
 | 
			
		||||
    [IN_KEYWORD] = {
 | 
			
		||||
        TERMINAL(JSON_KEYWORD),
 | 
			
		||||
        ['a' ... 'z'] = IN_KEYWORD,
 | 
			
		||||
    },
 | 
			
		||||
 | 
			
		||||
    /* whitespace */
 | 
			
		||||
    [IN_WHITESPACE] = {
 | 
			
		||||
        TERMINAL(JSON_SKIP),
 | 
			
		||||
        [' '] = IN_WHITESPACE,
 | 
			
		||||
        ['\t'] = IN_WHITESPACE,
 | 
			
		||||
        ['\r'] = IN_WHITESPACE,
 | 
			
		||||
        ['\n'] = IN_WHITESPACE,
 | 
			
		||||
    },        
 | 
			
		||||
 | 
			
		||||
    /* operator */
 | 
			
		||||
    [IN_OPERATOR_DONE] = {
 | 
			
		||||
        TERMINAL(JSON_OPERATOR),
 | 
			
		||||
    },
 | 
			
		||||
 | 
			
		||||
    /* escape */
 | 
			
		||||
    [IN_ESCAPE_DONE] = {
 | 
			
		||||
        TERMINAL(JSON_ESCAPE),
 | 
			
		||||
    },
 | 
			
		||||
 | 
			
		||||
    [IN_ESCAPE_LL] = {
 | 
			
		||||
        ['d'] = IN_ESCAPE_DONE,
 | 
			
		||||
    },
 | 
			
		||||
 | 
			
		||||
    [IN_ESCAPE_L] = {
 | 
			
		||||
        ['d'] = IN_ESCAPE_DONE,
 | 
			
		||||
        ['l'] = IN_ESCAPE_LL,
 | 
			
		||||
    },
 | 
			
		||||
 | 
			
		||||
    [IN_ESCAPE] = {
 | 
			
		||||
        ['d'] = IN_ESCAPE_DONE,
 | 
			
		||||
        ['i'] = IN_ESCAPE_DONE,
 | 
			
		||||
        ['p'] = IN_ESCAPE_DONE,
 | 
			
		||||
        ['s'] = IN_ESCAPE_DONE,
 | 
			
		||||
        ['f'] = IN_ESCAPE_DONE,
 | 
			
		||||
        ['l'] = IN_ESCAPE_L,
 | 
			
		||||
    },
 | 
			
		||||
 | 
			
		||||
    /* top level rule */
 | 
			
		||||
    [IN_START] = {
 | 
			
		||||
        ['"'] = IN_DQ_STRING,
 | 
			
		||||
        ['\''] = IN_SQ_STRING,
 | 
			
		||||
        ['0'] = IN_ZERO,
 | 
			
		||||
        ['1' ... '9'] = IN_NONZERO_NUMBER,
 | 
			
		||||
        ['-'] = IN_NEG_NONZERO_NUMBER,
 | 
			
		||||
        ['{'] = IN_OPERATOR_DONE,
 | 
			
		||||
        ['}'] = IN_OPERATOR_DONE,
 | 
			
		||||
        ['['] = IN_OPERATOR_DONE,
 | 
			
		||||
        [']'] = IN_OPERATOR_DONE,
 | 
			
		||||
        [','] = IN_OPERATOR_DONE,
 | 
			
		||||
        [':'] = IN_OPERATOR_DONE,
 | 
			
		||||
        ['a' ... 'z'] = IN_KEYWORD,
 | 
			
		||||
        ['%'] = IN_ESCAPE,
 | 
			
		||||
        [' '] = IN_WHITESPACE,
 | 
			
		||||
        ['\t'] = IN_WHITESPACE,
 | 
			
		||||
        ['\r'] = IN_WHITESPACE,
 | 
			
		||||
        ['\n'] = IN_WHITESPACE,
 | 
			
		||||
    },
 | 
			
		||||
};
 | 
			
		||||
 | 
			
		||||
void json_lexer_init(JSONLexer *lexer, JSONLexerEmitter func)
 | 
			
		||||
{
 | 
			
		||||
    lexer->emit = func;
 | 
			
		||||
    lexer->state = IN_START;
 | 
			
		||||
    lexer->token = qstring_new();
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
static int json_lexer_feed_char(JSONLexer *lexer, char ch)
 | 
			
		||||
{
 | 
			
		||||
    char buf[2];
 | 
			
		||||
 | 
			
		||||
    lexer->x++;
 | 
			
		||||
    if (ch == '\n') {
 | 
			
		||||
        lexer->x = 0;
 | 
			
		||||
        lexer->y++;
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    lexer->state = json_lexer[lexer->state][(uint8_t)ch];
 | 
			
		||||
 | 
			
		||||
    switch (lexer->state) {
 | 
			
		||||
    case JSON_OPERATOR:
 | 
			
		||||
    case JSON_ESCAPE:
 | 
			
		||||
    case JSON_INTEGER:
 | 
			
		||||
    case JSON_FLOAT:
 | 
			
		||||
    case JSON_KEYWORD:
 | 
			
		||||
    case JSON_STRING:
 | 
			
		||||
        lexer->emit(lexer, lexer->token, lexer->state, lexer->x, lexer->y);
 | 
			
		||||
    case JSON_SKIP:
 | 
			
		||||
        lexer->state = json_lexer[IN_START][(uint8_t)ch];
 | 
			
		||||
        QDECREF(lexer->token);
 | 
			
		||||
        lexer->token = qstring_new();
 | 
			
		||||
        break;
 | 
			
		||||
    case ERROR:
 | 
			
		||||
        return -EINVAL;
 | 
			
		||||
    default:
 | 
			
		||||
        break;
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    buf[0] = ch;
 | 
			
		||||
    buf[1] = 0;
 | 
			
		||||
 | 
			
		||||
    qstring_append(lexer->token, buf);
 | 
			
		||||
 | 
			
		||||
    return 0;
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
int json_lexer_feed(JSONLexer *lexer, const char *buffer, size_t size)
 | 
			
		||||
{
 | 
			
		||||
    size_t i;
 | 
			
		||||
 | 
			
		||||
    for (i = 0; i < size; i++) {
 | 
			
		||||
        int err;
 | 
			
		||||
 | 
			
		||||
        err = json_lexer_feed_char(lexer, buffer[i]);
 | 
			
		||||
        if (err < 0) {
 | 
			
		||||
            return err;
 | 
			
		||||
        }
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    return 0;
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
int json_lexer_flush(JSONLexer *lexer)
 | 
			
		||||
{
 | 
			
		||||
    return json_lexer_feed_char(lexer, 0);
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
void json_lexer_destroy(JSONLexer *lexer)
 | 
			
		||||
{
 | 
			
		||||
    QDECREF(lexer->token);
 | 
			
		||||
}
 | 
			
		||||
| 
						 | 
				
			
			@ -0,0 +1,50 @@
 | 
			
		|||
/*
 | 
			
		||||
 * JSON lexer
 | 
			
		||||
 *
 | 
			
		||||
 * Copyright IBM, Corp. 2009
 | 
			
		||||
 *
 | 
			
		||||
 * Authors:
 | 
			
		||||
 *  Anthony Liguori   <aliguori@us.ibm.com>
 | 
			
		||||
 *
 | 
			
		||||
 * This work is licensed under the terms of the GNU LGPL, version 2.1 or later.
 | 
			
		||||
 * See the COPYING.LIB file in the top-level directory.
 | 
			
		||||
 *
 | 
			
		||||
 */
 | 
			
		||||
 | 
			
		||||
#ifndef QEMU_JSON_LEXER_H
 | 
			
		||||
#define QEMU_JSON_LEXER_H
 | 
			
		||||
 | 
			
		||||
#include "qstring.h"
 | 
			
		||||
#include "qlist.h"
 | 
			
		||||
 | 
			
		||||
typedef enum json_token_type {
 | 
			
		||||
    JSON_OPERATOR = 100,
 | 
			
		||||
    JSON_INTEGER,
 | 
			
		||||
    JSON_FLOAT,
 | 
			
		||||
    JSON_KEYWORD,
 | 
			
		||||
    JSON_STRING,
 | 
			
		||||
    JSON_ESCAPE,
 | 
			
		||||
    JSON_SKIP,
 | 
			
		||||
} JSONTokenType;
 | 
			
		||||
 | 
			
		||||
typedef struct JSONLexer JSONLexer;
 | 
			
		||||
 | 
			
		||||
typedef void (JSONLexerEmitter)(JSONLexer *, QString *, JSONTokenType, int x, int y);
 | 
			
		||||
 | 
			
		||||
struct JSONLexer
 | 
			
		||||
{
 | 
			
		||||
    JSONLexerEmitter *emit;
 | 
			
		||||
    int state;
 | 
			
		||||
    QString *token;
 | 
			
		||||
    int x, y;
 | 
			
		||||
};
 | 
			
		||||
 | 
			
		||||
void json_lexer_init(JSONLexer *lexer, JSONLexerEmitter func);
 | 
			
		||||
 | 
			
		||||
int json_lexer_feed(JSONLexer *lexer, const char *buffer, size_t size);
 | 
			
		||||
 | 
			
		||||
int json_lexer_flush(JSONLexer *lexer);
 | 
			
		||||
 | 
			
		||||
void json_lexer_destroy(JSONLexer *lexer);
 | 
			
		||||
 | 
			
		||||
#endif
 | 
			
		||||
		Loading…
	
		Reference in New Issue