diff --git a/HISTORY b/HISTORY index 25d4da5..29bf732 100755 --- a/HISTORY +++ b/HISTORY @@ -1,3 +1,6 @@ +Apr. 22 2016 +Added UTF8 token support + Apr. 19 2016 Fixed an error raising issue with incomplete IF structure diff --git a/MY-BASIC Quick Reference.pdf b/MY-BASIC Quick Reference.pdf index 8b5687f..997d4f5 100644 Binary files a/MY-BASIC Quick Reference.pdf and b/MY-BASIC Quick Reference.pdf differ diff --git a/README.md b/README.md index c7b559d..3d0b148 100755 --- a/README.md +++ b/README.md @@ -42,6 +42,7 @@ MY-BASIC is a dynamic typed programming language with BASIC syntax and has a ver * **Lightweight** (within memory usage less than 128KB), fast, and cuttable * With most both retro and modern BASIC syntax * Case-insensitive tokenization, and many other indelible BASIC feelings +* Unicode support * **[Prototype-based programming](https://en.wikipedia.org/wiki/Prototype-based_programming)** (OOP) paradigm, with reflection support * **[Lambda abstraction](https://en.wikipedia.org/wiki/Anonymous_function)** enhanced functional programming * **Dynamic typed** integer, float point, string, boolean, user defined data types, etc. 
with array support diff --git a/core/my_basic.c b/core/my_basic.c index 21961c8..7641f19 100755 --- a/core/my_basic.c +++ b/core/my_basic.c @@ -1191,6 +1191,7 @@ static char* mb_strupr(char* s); /** Unicode handling */ #ifdef MB_ENABLE_UNICODE +static int mb_uu_getbom(const char** ch); static int mb_uu_ischar(const char* ch); static int mb_uu_strlen(const char* ch); static int mb_uu_substr(const char* ch, int begin, int count, char** o); @@ -1308,11 +1309,14 @@ static bool_t _is_using_char(char c); static bool_t _is_exponent_prefix(char* s, int begin, int end); static int _append_char_to_symbol(mb_interpreter_t* s, char c); +#ifdef MB_ENABLE_UNICODE_ID +static int _append_uu_char_to_symbol(mb_interpreter_t* s, const char* str, int n); +#endif /* MB_ENABLE_UNICODE_ID */ static int _cut_symbol(mb_interpreter_t* s, int pos, unsigned short row, unsigned short col); static int _append_symbol(mb_interpreter_t* s, char* sym, bool_t* delsym, int pos, unsigned short row, unsigned short col); static int _create_symbol(mb_interpreter_t* s, _ls_node_t* l, char* sym, _object_t** obj, _ls_node_t*** asgn, bool_t* delsym); static _data_e _get_symbol_type(mb_interpreter_t* s, char* sym, _raw_t* value); -static int _parse_char(mb_interpreter_t* s, const char** str, int pos, unsigned short row, unsigned short col); +static int _parse_char(mb_interpreter_t* s, const char* str, int n, int pos, unsigned short row, unsigned short col); static void _set_error_pos(mb_interpreter_t* s, int pos, unsigned short row, unsigned short col); static char* _prev_import(mb_interpreter_t* s, char* lf, int* pos, unsigned short* row, unsigned short* col); static char* _post_import(mb_interpreter_t* s, char* lf, int* pos, unsigned short* row, unsigned short* col); @@ -2914,6 +2918,24 @@ static char* mb_strupr(char* s) { /** Unicode handling */ #ifdef MB_ENABLE_UNICODE +/* Skip a leading UTF8 (EF BB BF) or UTF16 big-endian (FE FF) BOM at *ch; returns the number of bytes skipped, or 0 for a null argument or when there is no BOM */ +static int mb_uu_getbom(const char** ch) { + if(!ch || !(*ch)) + 
return 0; + + if((unsigned char)(*ch)[0] == 0xEF && (unsigned char)(*ch)[1] == 0xBB && (unsigned char)(*ch)[2] == 0xBF) { + *ch += 3; + + return 3; + } else if((unsigned char)(*ch)[0] == 0xFE && (unsigned char)(*ch)[1] == 0xFF) { + *ch += 2; + + return 2; + } + + return 0; +} + /* Determine whether a buffer is a UTF8 encoded character, and return taken bytes */ static int mb_uu_ischar(const char* ch) { /* Copyright 2008, 2009 Bjoern Hoehrmann, http://bjoern.hoehrmann.de/utf-8/decoder/dfa/ */ @@ -4274,10 +4296,7 @@ static void _end_of_file(_parsing_context_t* context) { /* Determine whether a character is blank */ static bool_t _is_blank_char(char c) { - return - (c == ' ') || (c == '\t') || - (c == -17) || (c == -69) || (c == -65) || - (c == -2) || (c == -1); + return (c == ' ') || (c == '\t'); } /* Determine whether a character is end of file */ @@ -4381,7 +4400,7 @@ static int _append_char_to_symbol(mb_interpreter_t* s, char c) { if(context->current_symbol_nonius + 1 >= _SINGLE_SYMBOL_MAX_LENGTH) { _set_current_error(s, SE_PS_SYMBOL_TOO_LONG, 0); - ++result; + result = MB_FUNC_ERR; } else { context->current_symbol[context->current_symbol_nonius] = c; ++context->current_symbol_nonius; @@ -4390,6 +4409,29 @@ static int _append_char_to_symbol(mb_interpreter_t* s, char c) { return result; } +#ifdef MB_ENABLE_UNICODE_ID +/* Append the n bytes of a UTF8 character at str to the current parsing symbol; sets SE_PS_SYMBOL_TOO_LONG and returns MB_FUNC_ERR if they do not fit */ +static int _append_uu_char_to_symbol(mb_interpreter_t* s, const char* str, int n) { + int result = MB_FUNC_OK; + _parsing_context_t* context = 0; + + mb_assert(s); + + context = s->parsing_context; + + if(context->current_symbol_nonius + n >= _SINGLE_SYMBOL_MAX_LENGTH) { + _set_current_error(s, SE_PS_SYMBOL_TOO_LONG, 0); + + result = MB_FUNC_ERR; + } else { + memcpy(&context->current_symbol[context->current_symbol_nonius], str, n); + context->current_symbol_nonius += n; + } + + return result; +} +#endif /* MB_ENABLE_UNICODE_ID */ + /* Cut current symbol when current one parsing is finished */ static int _cut_symbol(mb_interpreter_t* s, int pos, unsigned short 
row, unsigned short col) { int result = MB_FUNC_OK; @@ -5027,19 +5069,28 @@ _exit: } /* Parse a character */ -static int _parse_char(mb_interpreter_t* s, const char** str, int pos, unsigned short row, unsigned short col) { +static int _parse_char(mb_interpreter_t* s, const char* str, int n, int pos, unsigned short row, unsigned short col) { /* str points at the character to parse (0 means end of stream), n is its length in bytes */ int result = MB_FUNC_OK; _parsing_context_t* context = 0; char last_char = _ZERO_CHAR; char c = '\0'; +#ifdef MB_ENABLE_UNICODE_ID + unsigned uc = 0; +#endif /* MB_ENABLE_UNICODE_ID */ mb_assert(s && s->parsing_context); context = s->parsing_context; - if(str && *str) { - c = **str; - ++(*str); + if(str) { +#ifdef MB_ENABLE_UNICODE_ID + if(n == 1) + c = *str; + else + memcpy(&uc, str, n); +#else /* MB_ENABLE_UNICODE_ID */ + c = *str; +#endif /* MB_ENABLE_UNICODE_ID */ } else { c = MB_EOS; } @@ -5049,8 +5100,15 @@ static int _parse_char(mb_interpreter_t* s, const char** str, int pos, unsigned switch(context->parsing_state) { case _PS_NORMAL: - c = toupper(c); +#ifdef MB_ENABLE_UNICODE_ID + if(uc) { + _mb_check(result = _append_uu_char_to_symbol(s, str, n), _exit); + break; + } +#endif /* MB_ENABLE_UNICODE_ID */ + + c = (char)toupper((unsigned char)c); /* ctype functions are undefined for negative char values */ if(_is_blank_char(c)) { /* \t space */ _mb_check(result = _cut_symbol(s, pos, row, col), _exit); } else if(_is_newline_char(c)) { /* \r \n EOF */ @@ -11803,7 +11861,15 @@ int mb_load_string(struct mb_interpreter_t* s, const char* l, bool_t reset) { context = s->parsing_context; +#ifdef MB_ENABLE_UNICODE + mb_uu_getbom(&l); +#endif /* MB_ENABLE_UNICODE */ while(*l) { + int n = 1; +#ifdef MB_ENABLE_UNICODE_ID + if(context->parsing_state == _PS_NORMAL && (n = mb_uu_ischar(l)) < 1) + n = 1; /* Fall back to one raw byte if no UTF8 character was recognized, so that l += n always advances */ +#endif /* MB_ENABLE_UNICODE_ID */ ch = *l; if((ch == _NEWLINE_CHAR || ch == _RETURN_CHAR) && (!wrapped || wrapped == ch)) { wrapped = ch; @@ -11813,7 +11879,7 @@ int mb_load_string(struct mb_interpreter_t* s, const char* l, bool_t reset) { wrapped = _ZERO_CHAR; ++context->parsing_col; } - status = _parse_char(s, &l, 
context->parsing_pos, _row, _col); + status = _parse_char(s, l, n, context->parsing_pos, _row, _col); result = status; if(status) { _set_error_pos(s, context->parsing_pos, _row, _col); @@ -11824,8 +11890,9 @@ int mb_load_string(struct mb_interpreter_t* s, const char* l, bool_t reset) { _row = context->parsing_row; _col = context->parsing_col; ++context->parsing_pos; + l += n; }; - status = _parse_char(s, 0, context->parsing_pos, context->parsing_row, context->parsing_col); + status = _parse_char(s, 0, 1, context->parsing_pos, context->parsing_row, context->parsing_col); _exit: if(reset) @@ -11856,7 +11923,7 @@ int mb_load_file(struct mb_interpreter_t* s, const char* f) { } else { _set_current_error(s, SE_PS_FILE_OPEN_FAILED, 0); - ++result; + result = MB_FUNC_ERR; } _exit: diff --git a/core/my_basic.h b/core/my_basic.h index 749c61b..f1cdf99 100755 --- a/core/my_basic.h +++ b/core/my_basic.h @@ -124,6 +124,13 @@ extern "C" { # define MB_ENABLE_UNICODE #endif /* MB_ENABLE_UNICODE */ +#ifndef MB_ENABLE_UNICODE_ID +# define MB_ENABLE_UNICODE_ID +#endif /* MB_ENABLE_UNICODE_ID */ +#if defined MB_ENABLE_UNICODE_ID && !defined MB_ENABLE_UNICODE /* Checked outside the #ifndef so a user-supplied MB_ENABLE_UNICODE_ID is validated too */ +# error "Requires MB_ENABLE_UNICODE enabled." +#endif /* MB_ENABLE_UNICODE_ID && !MB_ENABLE_UNICODE */ + #ifndef MB_GC_GARBAGE_THRESHOLD # define MB_GC_GARBAGE_THRESHOLD 16 #endif /* MB_GC_GARBAGE_THRESHOLD */ diff --git a/output/my_basic.exe b/output/my_basic.exe index 00cb15b..ae32139 100755 Binary files a/output/my_basic.exe and b/output/my_basic.exe differ