+added utf8 token support.

This commit is contained in:
paladin-t 2016-04-22 17:14:02 +08:00
parent 403238fa45
commit f529ffb4e1
6 changed files with 92 additions and 14 deletions

View File

@ -1,3 +1,6 @@
Apr. 22 2016
Added UTF8 token support
Apr. 19 2016 Apr. 19 2016
Fixed an error raising issue with incomplete IF structure Fixed an error raising issue with incomplete IF structure

Binary file not shown.

View File

@ -42,6 +42,7 @@ MY-BASIC is a dynamic typed programming language with BASIC syntax and has a ver
* **Lightweight** (within memory usage less than 128KB), fast, and cuttable * **Lightweight** (within memory usage less than 128KB), fast, and cuttable
* With most both retro and modern BASIC syntax * With most both retro and modern BASIC syntax
* Case-insensitive tokenization, and many other indelible BASIC feelings * Case-insensitive tokenization, and many other indelible BASIC feelings
* Unicode support
* **[Prototype-based programming](https://en.wikipedia.org/wiki/Prototype-based_programming)** (OOP) paradigm, with reflection support * **[Prototype-based programming](https://en.wikipedia.org/wiki/Prototype-based_programming)** (OOP) paradigm, with reflection support
* **[Lambda abstraction](https://en.wikipedia.org/wiki/Anonymous_function)** enhanced functional programming * **[Lambda abstraction](https://en.wikipedia.org/wiki/Anonymous_function)** enhanced functional programming
* **Dynamic typed** integer, float point, string, boolean, user defined data types, etc. with array support * **Dynamic typed** integer, float point, string, boolean, user defined data types, etc. with array support

View File

@ -1191,6 +1191,7 @@ static char* mb_strupr(char* s);
/** Unicode handling */ /** Unicode handling */
#ifdef MB_ENABLE_UNICODE #ifdef MB_ENABLE_UNICODE
static int mb_uu_getbom(const char** ch);
static int mb_uu_ischar(const char* ch); static int mb_uu_ischar(const char* ch);
static int mb_uu_strlen(const char* ch); static int mb_uu_strlen(const char* ch);
static int mb_uu_substr(const char* ch, int begin, int count, char** o); static int mb_uu_substr(const char* ch, int begin, int count, char** o);
@ -1308,11 +1309,14 @@ static bool_t _is_using_char(char c);
static bool_t _is_exponent_prefix(char* s, int begin, int end); static bool_t _is_exponent_prefix(char* s, int begin, int end);
static int _append_char_to_symbol(mb_interpreter_t* s, char c); static int _append_char_to_symbol(mb_interpreter_t* s, char c);
#ifdef MB_ENABLE_UNICODE_ID
static int _append_uu_char_to_symbol(mb_interpreter_t* s, const char* str, int n);
#endif /* MB_ENABLE_UNICODE_ID */
static int _cut_symbol(mb_interpreter_t* s, int pos, unsigned short row, unsigned short col); static int _cut_symbol(mb_interpreter_t* s, int pos, unsigned short row, unsigned short col);
static int _append_symbol(mb_interpreter_t* s, char* sym, bool_t* delsym, int pos, unsigned short row, unsigned short col); static int _append_symbol(mb_interpreter_t* s, char* sym, bool_t* delsym, int pos, unsigned short row, unsigned short col);
static int _create_symbol(mb_interpreter_t* s, _ls_node_t* l, char* sym, _object_t** obj, _ls_node_t*** asgn, bool_t* delsym); static int _create_symbol(mb_interpreter_t* s, _ls_node_t* l, char* sym, _object_t** obj, _ls_node_t*** asgn, bool_t* delsym);
static _data_e _get_symbol_type(mb_interpreter_t* s, char* sym, _raw_t* value); static _data_e _get_symbol_type(mb_interpreter_t* s, char* sym, _raw_t* value);
static int _parse_char(mb_interpreter_t* s, const char** str, int pos, unsigned short row, unsigned short col); static int _parse_char(mb_interpreter_t* s, const char* str, int n, int pos, unsigned short row, unsigned short col);
static void _set_error_pos(mb_interpreter_t* s, int pos, unsigned short row, unsigned short col); static void _set_error_pos(mb_interpreter_t* s, int pos, unsigned short row, unsigned short col);
static char* _prev_import(mb_interpreter_t* s, char* lf, int* pos, unsigned short* row, unsigned short* col); static char* _prev_import(mb_interpreter_t* s, char* lf, int* pos, unsigned short* row, unsigned short* col);
static char* _post_import(mb_interpreter_t* s, char* lf, int* pos, unsigned short* row, unsigned short* col); static char* _post_import(mb_interpreter_t* s, char* lf, int* pos, unsigned short* row, unsigned short* col);
@ -2914,6 +2918,24 @@ static char* mb_strupr(char* s) {
/** Unicode handling */ /** Unicode handling */
#ifdef MB_ENABLE_UNICODE #ifdef MB_ENABLE_UNICODE
/* Determine whether a string begins with a BOM, and skip over it.
   `ch` is the address of the caller's cursor into a NUL-terminated buffer.
   When the buffer starts with the bytes EF BB BF (the UTF-8 BOM) or with the
   byte pair FE FF, the cursor is advanced past them.
   Returns the number of bytes skipped (3 or 2), or 0 when nothing was skipped
   or when `ch` or `*ch` is null. */
static int mb_uu_getbom(const char** ch) {
	const unsigned char* p = 0;

	/* Nothing to inspect if either the cursor's address or the cursor itself is null */
	if(!ch || !(*ch))
		return 0;

	/* Compare as unsigned bytes so the result does not depend on whether plain
	   `char` is signed; short-circuit evaluation stops at the terminating NUL,
	   so bytes past the end of a short string are never read */
	p = (const unsigned char*)(*ch);
	if(p[0] == 0xEF && p[1] == 0xBB && p[2] == 0xBF) {
		*ch += 3;

		return 3;
	} else if(p[0] == 0xFE && p[1] == 0xFF) {
		*ch += 2;

		return 2;
	}

	return 0;
}
/* Determine whether a buffer is a UTF8 encoded character, and return taken bytes */ /* Determine whether a buffer is a UTF8 encoded character, and return taken bytes */
static int mb_uu_ischar(const char* ch) { static int mb_uu_ischar(const char* ch) {
/* Copyright 2008, 2009 Bjoern Hoehrmann, http://bjoern.hoehrmann.de/utf-8/decoder/dfa/ */ /* Copyright 2008, 2009 Bjoern Hoehrmann, http://bjoern.hoehrmann.de/utf-8/decoder/dfa/ */
@ -4274,10 +4296,7 @@ static void _end_of_file(_parsing_context_t* context) {
/* Determine whether a character is blank */ /* Determine whether a character is blank */
static bool_t _is_blank_char(char c) { static bool_t _is_blank_char(char c) {
return return (c == ' ') || (c == '\t');
(c == ' ') || (c == '\t') ||
(c == -17) || (c == -69) || (c == -65) ||
(c == -2) || (c == -1);
} }
/* Determine whether a character is end of file */ /* Determine whether a character is end of file */
@ -4381,7 +4400,7 @@ static int _append_char_to_symbol(mb_interpreter_t* s, char c) {
if(context->current_symbol_nonius + 1 >= _SINGLE_SYMBOL_MAX_LENGTH) { if(context->current_symbol_nonius + 1 >= _SINGLE_SYMBOL_MAX_LENGTH) {
_set_current_error(s, SE_PS_SYMBOL_TOO_LONG, 0); _set_current_error(s, SE_PS_SYMBOL_TOO_LONG, 0);
++result; result = MB_FUNC_ERR;
} else { } else {
context->current_symbol[context->current_symbol_nonius] = c; context->current_symbol[context->current_symbol_nonius] = c;
++context->current_symbol_nonius; ++context->current_symbol_nonius;
@ -4390,6 +4409,29 @@ static int _append_char_to_symbol(mb_interpreter_t* s, char c) {
return result; return result;
} }
#ifdef MB_ENABLE_UNICODE_ID
/* Append the `n` bytes of one UTF8 encoded character, starting at `str`, to
   the symbol currently being collected by the parser.
   Returns MB_FUNC_OK on success; raises SE_PS_SYMBOL_TOO_LONG and returns
   MB_FUNC_ERR when the bytes would not fit in the symbol buffer. */
static int _append_uu_char_to_symbol(mb_interpreter_t* s, const char* str, int n) {
	_parsing_context_t* ctx = 0;

	mb_assert(s);

	ctx = s->parsing_context;

	/* Refuse the character if appending it would reach the symbol length limit */
	if(ctx->current_symbol_nonius + n >= _SINGLE_SYMBOL_MAX_LENGTH) {
		_set_current_error(s, SE_PS_SYMBOL_TOO_LONG, 0);

		return MB_FUNC_ERR;
	}

	/* Copy the raw bytes to the write cursor, then move the cursor past them */
	memcpy(ctx->current_symbol + ctx->current_symbol_nonius, str, n);
	ctx->current_symbol_nonius += n;

	return MB_FUNC_OK;
}
#endif /* MB_ENABLE_UNICODE_ID */
/* Cut current symbol when current one parsing is finished */ /* Cut current symbol when current one parsing is finished */
static int _cut_symbol(mb_interpreter_t* s, int pos, unsigned short row, unsigned short col) { static int _cut_symbol(mb_interpreter_t* s, int pos, unsigned short row, unsigned short col) {
int result = MB_FUNC_OK; int result = MB_FUNC_OK;
@ -5027,19 +5069,28 @@ _exit:
} }
/* Parse a character */ /* Parse a character */
static int _parse_char(mb_interpreter_t* s, const char** str, int pos, unsigned short row, unsigned short col) { static int _parse_char(mb_interpreter_t* s, const char* str, int n, int pos, unsigned short row, unsigned short col) {
int result = MB_FUNC_OK; int result = MB_FUNC_OK;
_parsing_context_t* context = 0; _parsing_context_t* context = 0;
char last_char = _ZERO_CHAR; char last_char = _ZERO_CHAR;
char c = '\0'; char c = '\0';
#ifdef MB_ENABLE_UNICODE_ID
unsigned uc = 0;
#endif /* MB_ENABLE_UNICODE_ID */
mb_assert(s && s->parsing_context); mb_assert(s && s->parsing_context);
context = s->parsing_context; context = s->parsing_context;
if(str && *str) { if(str) {
c = **str; #ifdef MB_ENABLE_UNICODE_ID
++(*str); if(n == 1)
c = *str;
else
memcpy(&uc, str, n);
#else /* MB_ENABLE_UNICODE_ID */
c = *str;
#endif /* MB_ENABLE_UNICODE_ID */
} else { } else {
c = MB_EOS; c = MB_EOS;
} }
@ -5049,8 +5100,15 @@ static int _parse_char(mb_interpreter_t* s, const char** str, int pos, unsigned
switch(context->parsing_state) { switch(context->parsing_state) {
case _PS_NORMAL: case _PS_NORMAL:
c = toupper(c); #ifdef MB_ENABLE_UNICODE_ID
if(uc) {
_mb_check(result = _append_uu_char_to_symbol(s, str, n), _exit);
break;
}
#endif /* MB_ENABLE_UNICODE_ID */
c = toupper(c);
if(_is_blank_char(c)) { /* \t space */ if(_is_blank_char(c)) { /* \t space */
_mb_check(result = _cut_symbol(s, pos, row, col), _exit); _mb_check(result = _cut_symbol(s, pos, row, col), _exit);
} else if(_is_newline_char(c)) { /* \r \n EOF */ } else if(_is_newline_char(c)) { /* \r \n EOF */
@ -11803,7 +11861,15 @@ int mb_load_string(struct mb_interpreter_t* s, const char* l, bool_t reset) {
context = s->parsing_context; context = s->parsing_context;
#ifdef MB_ENABLE_UNICODE
mb_uu_getbom(&l);
#endif /* MB_ENABLE_UNICODE */
while(*l) { while(*l) {
int n = 1;
#ifdef MB_ENABLE_UNICODE_ID
if(context->parsing_state == _PS_NORMAL)
n = mb_uu_ischar(l);
#endif /* MB_ENABLE_UNICODE_ID */
ch = *l; ch = *l;
if((ch == _NEWLINE_CHAR || ch == _RETURN_CHAR) && (!wrapped || wrapped == ch)) { if((ch == _NEWLINE_CHAR || ch == _RETURN_CHAR) && (!wrapped || wrapped == ch)) {
wrapped = ch; wrapped = ch;
@ -11813,7 +11879,7 @@ int mb_load_string(struct mb_interpreter_t* s, const char* l, bool_t reset) {
wrapped = _ZERO_CHAR; wrapped = _ZERO_CHAR;
++context->parsing_col; ++context->parsing_col;
} }
status = _parse_char(s, &l, context->parsing_pos, _row, _col); status = _parse_char(s, l, n, context->parsing_pos, _row, _col);
result = status; result = status;
if(status) { if(status) {
_set_error_pos(s, context->parsing_pos, _row, _col); _set_error_pos(s, context->parsing_pos, _row, _col);
@ -11824,8 +11890,9 @@ int mb_load_string(struct mb_interpreter_t* s, const char* l, bool_t reset) {
_row = context->parsing_row; _row = context->parsing_row;
_col = context->parsing_col; _col = context->parsing_col;
++context->parsing_pos; ++context->parsing_pos;
l += n;
}; };
status = _parse_char(s, 0, context->parsing_pos, context->parsing_row, context->parsing_col); status = _parse_char(s, 0, 1, context->parsing_pos, context->parsing_row, context->parsing_col);
_exit: _exit:
if(reset) if(reset)
@ -11856,7 +11923,7 @@ int mb_load_file(struct mb_interpreter_t* s, const char* f) {
} else { } else {
_set_current_error(s, SE_PS_FILE_OPEN_FAILED, 0); _set_current_error(s, SE_PS_FILE_OPEN_FAILED, 0);
++result; result = MB_FUNC_ERR;
} }
_exit: _exit:

View File

@ -124,6 +124,13 @@ extern "C" {
# define MB_ENABLE_UNICODE # define MB_ENABLE_UNICODE
#endif /* MB_ENABLE_UNICODE */ #endif /* MB_ENABLE_UNICODE */
/* UTF8 token support switch; defined by default unless the build already
   defines it. The dependency check is kept outside the default-definition
   guard so that it also runs when MB_ENABLE_UNICODE_ID is predefined by the
   build rather than by this header. */
#ifndef MB_ENABLE_UNICODE_ID
# define MB_ENABLE_UNICODE_ID
#endif /* MB_ENABLE_UNICODE_ID */
#if defined MB_ENABLE_UNICODE_ID && !defined MB_ENABLE_UNICODE
# error "Requires MB_ENABLE_UNICODE enabled."
#endif /* MB_ENABLE_UNICODE_ID && !MB_ENABLE_UNICODE */
#ifndef MB_GC_GARBAGE_THRESHOLD #ifndef MB_GC_GARBAGE_THRESHOLD
# define MB_GC_GARBAGE_THRESHOLD 16 # define MB_GC_GARBAGE_THRESHOLD 16
#endif /* MB_GC_GARBAGE_THRESHOLD */ #endif /* MB_GC_GARBAGE_THRESHOLD */

Binary file not shown.