+added UTF8 support for string manipulation.

This commit is contained in:
Wang Renxin 2015-09-29 13:04:01 +08:00
parent f58e0f4f4c
commit e8a1f62c64

View File

@ -79,7 +79,7 @@ extern "C" {
/** Macros */ /** Macros */
#define _VER_MAJOR 1 #define _VER_MAJOR 1
#define _VER_MINOR 1 #define _VER_MINOR 1
#define _VER_REVISION 75 #define _VER_REVISION 76
#define _VER_SUFFIX #define _VER_SUFFIX
#define _MB_VERSION ((_VER_MAJOR * 0x01000000) + (_VER_MINOR * 0x00010000) + (_VER_REVISION)) #define _MB_VERSION ((_VER_MAJOR * 0x01000000) + (_VER_MINOR * 0x00010000) + (_VER_REVISION))
#define _STRINGIZE(A) _MAKE_STRINGIZE(A) #define _STRINGIZE(A) _MAKE_STRINGIZE(A)
@ -112,6 +112,7 @@ extern "C" {
#endif /* toupper */ #endif /* toupper */
#define DON(__o) ((__o) ? ((_object_t*)((__o)->data)) : 0) #define DON(__o) ((__o) ? ((_object_t*)((__o)->data)) : 0)
#define TON(__t) (((__t) && *(__t)) ? ((_object_t*)(((_tuple3_t*)(*(__t)))->e1)) : 0)
#define _IS_EOS(__o) (__o && ((_object_t*)(__o))->type == _DT_EOS) #define _IS_EOS(__o) (__o && ((_object_t*)(__o))->type == _DT_EOS)
#define _IS_SEP(__o, __c) (((_object_t*)(__o))->type == _DT_SEP && ((_object_t*)(__o))->data.separator == __c) #define _IS_SEP(__o, __c) (((_object_t*)(__o))->type == _DT_SEP && ((_object_t*)(__o))->data.separator == __c)
@ -178,6 +179,7 @@ static const char* _ERR_DESC[] = {
"Syntax error", "Syntax error",
"Invalid data type", "Invalid data type",
"Type does not match", "Type does not match",
"Invalid string",
"Illegal bound", "Illegal bound",
"Too much dimensions", "Too much dimensions",
"Operation failed", "Operation failed",
@ -747,6 +749,13 @@ static char* mb_strupr(char* s);
#define safe_free(__p) do { if(__p) { mb_free(__p); __p = 0; } else { mb_assert(0 && "Memory already released"); } } while(0) #define safe_free(__p) do { if(__p) { mb_free(__p); __p = 0; } else { mb_assert(0 && "Memory already released"); } } while(0)
/** Unicode handling */
#ifdef MB_ENABLE_UNICODE
static int mb_uu_ischar(char* ch);
static int mb_uu_strlen(char* ch);
static int mb_uu_substr(char* ch, int begin, int count, char** o);
#endif /* MB_ENABLE_UNICODE */
/** Expression processing */ /** Expression processing */
static bool_t _is_operator(mb_func_t op); static bool_t _is_operator(mb_func_t op);
static bool_t _is_flow(mb_func_t op); static bool_t _is_flow(mb_func_t op);
@ -1534,6 +1543,120 @@ char* mb_strupr(char* s) {
return t; return t;
} }
/** Unicode handling */
#ifdef MB_ENABLE_UNICODE
int mb_uu_ischar(char* ch) {
/* Determine whether a buffer is a UTF8 encoded character, and return _TAKEn bytes */
#define _TAKE(__ch, __c, __r) do { __c = *__ch++; __r++; } while(0)
#define _COPY(__ch, __c, __r, __cp) do { _TAKE(__ch, __c, __r); __cp = (__cp << 6) | ((unsigned char)__c & 0x3Fu); } while(0)
#define _TRANS(__m, __cp, __g) do { __cp &= ((__g[(unsigned char)c] & __m) != 0); } while(0)
#define _TAIL(__ch, __c, __r, __cp, __g) do { _COPY(__ch, __c, __r, __cp); _TRANS(0x70, __cp, __g); } while(0)
static const unsigned char range[] = {
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10,
0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
8, 8, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
10, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 3, 3, 11, 6, 6, 6, 5, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8
};
int result = 0;
unsigned codepoint = 0;
unsigned char type = 0;
char c = 0;
if(!ch)
return 0;
_TAKE(ch, c, result);
if(!(c & 0x80)) {
codepoint = (unsigned char)c;
return 1;
}
type = range[(unsigned char)c];
codepoint = (0xFF >> type) & (unsigned char)c;
switch (type) {
case 2: _TAIL(ch, c, result, codepoint, range); return result;
case 3: _TAIL(ch, c, result, codepoint, range); _TAIL(ch, c, result, codepoint, range); return result;
case 4: _COPY(ch, c, result, codepoint); _TRANS(0x50, codepoint, range); _TAIL(ch, c, result, codepoint, range); return result;
case 5: _COPY(ch, c, result, codepoint); _TRANS(0x10, codepoint, range); _TAIL(ch, c, result, codepoint, range); _TAIL(ch, c, result, codepoint, range); return result;
case 6: _TAIL(ch, c, result, codepoint, range); _TAIL(ch, c, result, codepoint, range); _TAIL(ch, c, result, codepoint, range); return result;
case 10: _COPY(ch, c, result, codepoint); _TRANS(0x20, codepoint, range); _TAIL(ch, c, result, codepoint, range); return result;
case 11: _COPY(ch, c, result, codepoint); _TRANS(0x60, codepoint, range); _TAIL(ch, c, result, codepoint, range); _TAIL(ch, c, result, codepoint, range); return result;
default: return 0;
}
#undef _TAKE
#undef _COPY
#undef _TRANS
#undef _TAIL
}
int mb_uu_strlen(char* ch) {
/* Tell how many UTF8 character are there in a string */
int result = 0;
if(!ch)
return 0;
while(*ch) {
int t = mb_uu_ischar(ch);
if(t <= 0) return t;
ch += t;
result++;
}
return result;
}
int mb_uu_substr(char* ch, int begin, int count, char** o) {
/* Retrieve a sub string of a UTF8 string */
int cnt = 0;
char* b = 0;
char* e = 0;
int l = 0;
if(!ch || begin < 0 || count <= 0 || !o)
return -1;
while(*ch) {
int t = mb_uu_ischar(ch);
if(t <= 0) return t;
if(cnt == begin) {
b = ch;
break;
}
ch += t;
cnt++;
}
while(*ch) {
int t = mb_uu_ischar(ch);
if(t <= 0) return t;
if(cnt == begin + count) {
e = ch;
break;
}
ch += t;
e = ch;
cnt++;
}
l = e - b;
*o = (char*)mb_malloc(l + 1);
memcpy(*o, b, l);
(*o)[l] = '\0';
return l;
}
#endif /* MB_ENABLE_UNICODE */
/** Expression processing */ /** Expression processing */
bool_t _is_operator(mb_func_t op) { bool_t _is_operator(mb_func_t op) {
/* Determine whether a function is an operator */ /* Determine whether a function is an operator */
@ -2250,7 +2373,9 @@ char* _load_file(const char* f, const char* prefix) {
bool_t _is_blank(char c) { bool_t _is_blank(char c) {
/* Determine whether a character is a blank */ /* Determine whether a character is a blank */
return (' ' == c) || ('\t' == c); return (' ' == c) || ('\t' == c) ||
(-17 == c) || (-69 == c) || (-65 == c) ||
(-2 == c) || (-1 == c);
} }
bool_t _is_newline(char c) { bool_t _is_newline(char c) {
@ -5561,7 +5686,7 @@ int _core_add(mb_interpreter_t* s, void** l) {
if(_is_string(((_tuple3_t*)(*l))->e1) && _is_string(((_tuple3_t*)(*l))->e2)) { if(_is_string(((_tuple3_t*)(*l))->e1) && _is_string(((_tuple3_t*)(*l))->e2)) {
_instruct_connect_strings(l); _instruct_connect_strings(l);
} else { } else {
_handle_error_on_obj(s, SE_RN_STRING_EXPECTED, 0, (l && *l) ? ((_object_t*)(((_tuple3_t*)(*l))->e1)) : 0, MB_FUNC_ERR, _exit, result); _handle_error_on_obj(s, SE_RN_STRING_EXPECTED, 0, TON(l), MB_FUNC_ERR, _exit, result);
} }
} else { } else {
_instruct_num_op_num(+, l); _instruct_num_op_num(+, l);
@ -5686,7 +5811,7 @@ int _core_neg(mb_interpreter_t* s, void** l) {
break; break;
default: default:
_handle_error_on_obj(s, SE_RN_NUMBER_EXPECTED, 0, (l && *l) ? ((_object_t*)(((_tuple3_t*)(*l))->e1)) : 0, MB_FUNC_WARNING, _exit, result); _handle_error_on_obj(s, SE_RN_NUMBER_EXPECTED, 0, TON(l), MB_FUNC_WARNING, _exit, result);
break; break;
} }
@ -5709,7 +5834,7 @@ int _core_equal(mb_interpreter_t* s, void** l) {
_instruct_compare_strings(==, l); _instruct_compare_strings(==, l);
} else { } else {
_set_tuple3_result(l, 0); _set_tuple3_result(l, 0);
_handle_error_on_obj(s, SE_RN_STRING_EXPECTED, 0, (l && *l) ? ((_object_t*)(((_tuple3_t*)(*l))->e1)) : 0, MB_FUNC_WARNING, _exit, result); _handle_error_on_obj(s, SE_RN_STRING_EXPECTED, 0, TON(l), MB_FUNC_WARNING, _exit, result);
} }
} else { } else {
_instruct_num_op_num(==, l); _instruct_num_op_num(==, l);
@ -5740,7 +5865,7 @@ int _core_less(mb_interpreter_t* s, void** l) {
} else { } else {
_set_tuple3_result(l, 1); _set_tuple3_result(l, 1);
} }
_handle_error_on_obj(s, SE_RN_STRING_EXPECTED, 0, (l && *l) ? ((_object_t*)(((_tuple3_t*)(*l))->e1)) : 0, MB_FUNC_WARNING, _exit, result); _handle_error_on_obj(s, SE_RN_STRING_EXPECTED, 0, TON(l), MB_FUNC_WARNING, _exit, result);
} }
} else { } else {
_instruct_num_op_num(<, l); _instruct_num_op_num(<, l);
@ -5771,7 +5896,7 @@ int _core_greater(mb_interpreter_t* s, void** l) {
} else { } else {
_set_tuple3_result(l, 0); _set_tuple3_result(l, 0);
} }
_handle_error_on_obj(s, SE_RN_STRING_EXPECTED, 0, (l && *l) ? ((_object_t*)(((_tuple3_t*)(*l))->e1)) : 0, MB_FUNC_WARNING, _exit, result); _handle_error_on_obj(s, SE_RN_STRING_EXPECTED, 0, TON(l), MB_FUNC_WARNING, _exit, result);
} }
} else { } else {
_instruct_num_op_num(>, l); _instruct_num_op_num(>, l);
@ -5802,7 +5927,7 @@ int _core_less_equal(mb_interpreter_t* s, void** l) {
} else { } else {
_set_tuple3_result(l, 1); _set_tuple3_result(l, 1);
} }
_handle_error_on_obj(s, SE_RN_STRING_EXPECTED, 0, (l && *l) ? ((_object_t*)(((_tuple3_t*)(*l))->e1)) : 0, MB_FUNC_WARNING, _exit, result); _handle_error_on_obj(s, SE_RN_STRING_EXPECTED, 0, TON(l), MB_FUNC_WARNING, _exit, result);
} }
} else { } else {
_instruct_num_op_num(<=, l); _instruct_num_op_num(<=, l);
@ -5833,7 +5958,7 @@ int _core_greater_equal(mb_interpreter_t* s, void** l) {
} else { } else {
_set_tuple3_result(l, 0); _set_tuple3_result(l, 0);
} }
_handle_error_on_obj(s, SE_RN_STRING_EXPECTED, 0, (l && *l) ? ((_object_t*)(((_tuple3_t*)(*l))->e1)) : 0, MB_FUNC_WARNING, _exit, result); _handle_error_on_obj(s, SE_RN_STRING_EXPECTED, 0, TON(l), MB_FUNC_WARNING, _exit, result);
} }
} else { } else {
_instruct_num_op_num(>=, l); _instruct_num_op_num(>=, l);
@ -5860,7 +5985,7 @@ int _core_not_equal(mb_interpreter_t* s, void** l) {
_instruct_compare_strings(!=, l); _instruct_compare_strings(!=, l);
} else { } else {
_set_tuple3_result(l, 1); _set_tuple3_result(l, 1);
_handle_error_on_obj(s, SE_RN_STRING_EXPECTED, 0, (l && *l) ? ((_object_t*)(((_tuple3_t*)(*l))->e1)) : 0, MB_FUNC_WARNING, _exit, result); _handle_error_on_obj(s, SE_RN_STRING_EXPECTED, 0, TON(l), MB_FUNC_WARNING, _exit, result);
} }
} else { } else {
_instruct_num_op_num(!=, l); _instruct_num_op_num(!=, l);
@ -6908,7 +7033,7 @@ int _std_abs(mb_interpreter_t* s, void** l) {
break; break;
default: default:
_handle_error_on_obj(s, SE_RN_NUMBER_EXPECTED, 0, (l && *l) ? ((_object_t*)(((_tuple3_t*)(*l))->e1)) : 0, MB_FUNC_WARNING, _exit, result); _handle_error_on_obj(s, SE_RN_NUMBER_EXPECTED, 0, TON(l), MB_FUNC_WARNING, _exit, result);
break; break;
} }
@ -6943,7 +7068,7 @@ int _std_sgn(mb_interpreter_t* s, void** l) {
break; break;
default: default:
_handle_error_on_obj(s, SE_RN_NUMBER_EXPECTED, 0, (l && *l) ? ((_object_t*)(((_tuple3_t*)(*l))->e1)) : 0, MB_FUNC_WARNING, _exit, result); _handle_error_on_obj(s, SE_RN_NUMBER_EXPECTED, 0, TON(l), MB_FUNC_WARNING, _exit, result);
break; break;
} }
@ -6998,7 +7123,7 @@ int _std_floor(mb_interpreter_t* s, void** l) {
break; break;
default: default:
_handle_error_on_obj(s, SE_RN_NUMBER_EXPECTED, 0, (l && *l) ? ((_object_t*)(((_tuple3_t*)(*l))->e1)) : 0, MB_FUNC_WARNING, _exit, result); _handle_error_on_obj(s, SE_RN_NUMBER_EXPECTED, 0, TON(l), MB_FUNC_WARNING, _exit, result);
break; break;
} }
@ -7032,7 +7157,7 @@ int _std_ceil(mb_interpreter_t* s, void** l) {
break; break;
default: default:
_handle_error_on_obj(s, SE_RN_NUMBER_EXPECTED, 0, (l && *l) ? ((_object_t*)(((_tuple3_t*)(*l))->e1)) : 0, MB_FUNC_WARNING, _exit, result); _handle_error_on_obj(s, SE_RN_NUMBER_EXPECTED, 0, TON(l), MB_FUNC_WARNING, _exit, result);
break; break;
} }
@ -7066,7 +7191,7 @@ int _std_fix(mb_interpreter_t* s, void** l) {
break; break;
default: default:
_handle_error_on_obj(s, SE_RN_NUMBER_EXPECTED, 0, (l && *l) ? ((_object_t*)(((_tuple3_t*)(*l))->e1)) : 0, MB_FUNC_WARNING, _exit, result); _handle_error_on_obj(s, SE_RN_NUMBER_EXPECTED, 0, TON(l), MB_FUNC_WARNING, _exit, result);
break; break;
} }
@ -7100,7 +7225,7 @@ int _std_round(mb_interpreter_t* s, void** l) {
break; break;
default: default:
_handle_error_on_obj(s, SE_RN_NUMBER_EXPECTED, 0, (l && *l) ? ((_object_t*)(((_tuple3_t*)(*l))->e1)) : 0, MB_FUNC_WARNING, _exit, result); _handle_error_on_obj(s, SE_RN_NUMBER_EXPECTED, 0, TON(l), MB_FUNC_WARNING, _exit, result);
break; break;
} }
@ -7298,6 +7423,7 @@ int _std_asc(mb_interpreter_t* s, void** l) {
/* Get the ASCII code of a character */ /* Get the ASCII code of a character */
int result = MB_FUNC_OK; int result = MB_FUNC_OK;
char* arg = 0; char* arg = 0;
int_t val = 0;
mb_assert(s && l); mb_assert(s && l);
@ -7312,7 +7438,8 @@ int _std_asc(mb_interpreter_t* s, void** l) {
goto _exit; goto _exit;
} }
mb_check(mb_push_int(s, l, (int_t)arg[0])); memcpy(&val, arg, strlen(arg));
mb_check(mb_push_int(s, l, val));
_exit: _exit:
return result; return result;
@ -7332,9 +7459,9 @@ int _std_chr(mb_interpreter_t* s, void** l) {
mb_check(mb_attempt_close_bracket(s, l)); mb_check(mb_attempt_close_bracket(s, l));
chr = (char*)mb_malloc(2); chr = (char*)mb_malloc(sizeof(arg) + 1);
memset(chr, 0, 2); memset(chr, 0, sizeof(arg) + 1);
chr[0] = (char)arg; memcpy(chr, &arg, sizeof(arg));
mb_check(mb_push_string(s, l, chr)); mb_check(mb_push_string(s, l, chr));
_mark_lazy_destroy_string(s, chr); _mark_lazy_destroy_string(s, chr);
@ -7363,9 +7490,15 @@ int _std_left(mb_interpreter_t* s, void** l) {
goto _exit; goto _exit;
} }
#ifdef MB_ENABLE_UNICODE
if(mb_uu_substr(arg, 0, count, &sub) <= 0) {
_handle_error_on_obj(s, SE_RN_INVALID_STRING, 0, TON(l), MB_FUNC_ERR, _exit, result);
}
#else /* MB_ENABLE_UNICODE */
sub = (char*)mb_malloc(count + 1); sub = (char*)mb_malloc(count + 1);
memcpy(sub, arg, count); memcpy(sub, arg, count);
sub[count] = '\0'; sub[count] = '\0';
#endif /* MB_ENABLE_UNICODE */
mb_check(mb_push_string(s, l, sub)); mb_check(mb_push_string(s, l, sub));
_mark_lazy_destroy_string(s, sub); _mark_lazy_destroy_string(s, sub);
@ -7397,9 +7530,15 @@ int _std_mid(mb_interpreter_t* s, void** l) {
goto _exit; goto _exit;
} }
#ifdef MB_ENABLE_UNICODE
if(mb_uu_substr(arg + start, 0, count, &sub) <= 0) {
_handle_error_on_obj(s, SE_RN_INVALID_STRING, 0, TON(l), MB_FUNC_ERR, _exit, result);
}
#else /* MB_ENABLE_UNICODE */
sub = (char*)mb_malloc(count + 1); sub = (char*)mb_malloc(count + 1);
memcpy(sub, arg + start, count); memcpy(sub, arg + start, count);
sub[count] = '\0'; sub[count] = '\0';
#endif /* MB_ENABLE_UNICODE */
mb_check(mb_push_string(s, l, sub)); mb_check(mb_push_string(s, l, sub));
_mark_lazy_destroy_string(s, sub); _mark_lazy_destroy_string(s, sub);
@ -7429,9 +7568,15 @@ int _std_right(mb_interpreter_t* s, void** l) {
goto _exit; goto _exit;
} }
#ifdef MB_ENABLE_UNICODE
if(mb_uu_substr(arg + (mb_uu_strlen(arg) - count), 0, count, &sub) <= 0) {
_handle_error_on_obj(s, SE_RN_INVALID_STRING, 0, TON(l), MB_FUNC_ERR, _exit, result);
}
#else /* MB_ENABLE_UNICODE */
sub = (char*)mb_malloc(count + 1); sub = (char*)mb_malloc(count + 1);
memcpy(sub, arg + (strlen(arg) - count), count); memcpy(sub, arg + (strlen(arg) - count), count);
sub[count] = '\0'; sub[count] = '\0';
#endif /* MB_ENABLE_UNICODE */
mb_check(mb_push_string(s, l, sub)); mb_check(mb_push_string(s, l, sub));
_mark_lazy_destroy_string(s, sub); _mark_lazy_destroy_string(s, sub);
@ -7525,7 +7670,11 @@ int _std_len(mb_interpreter_t* s, void** l) {
switch(arg.type) { switch(arg.type) {
case MB_DT_STRING: case MB_DT_STRING:
#ifdef MB_ENABLE_UNICODE
mb_check(mb_push_int(s, l, (int_t)mb_uu_strlen(arg.value.string)));
#else /* MB_ENABLE_UNICODE */
mb_check(mb_push_int(s, l, (int_t)strlen(arg.value.string))); mb_check(mb_push_int(s, l, (int_t)strlen(arg.value.string)));
#endif /* MB_ENABLE_UNICODE */
break; break;
case MB_DT_ARRAY: case MB_DT_ARRAY: