jancy
jancy copied to clipboard
Missing C Runtime Functions
Trying to port lua regular expression patterns to jancy
I needed the following missing C Runtime Functions
that I think would benefit others:
isalpha
iscntrl
isdigit
isgraph
islower
ispunct
isspace
isupper
isalnum
isxdigit
Also missing math
functions and info about limits of library types defined in header <stdint.h>
:
CHAR_BIT
number of bits in a byte
(macro constant)
MB_LEN_MAX
maximum number of bytes in a multibyte character
(macro constant)
CHAR_MIN
minimum value of char
(macro constant)
CHAR_MAX
maximum value of char
(macro constant)
SCHAR_MIN
SHRT_MIN
INT_MIN
LONG_MIN
LLONG_MIN
(C99)
minimum value of signed char, short, int, long and long long respectively
(macro constant)
SCHAR_MAX
SHRT_MAX
INT_MAX
LONG_MAX
LLONG_MAX
(C99)
maximum value of signed char, short, int, long and long long respectively
(macro constant)
UCHAR_MAX
USHRT_MAX
UINT_MAX
ULONG_MAX
ULLONG_MAX
(C99)
maximum value of unsigned char, unsigned short, unsigned int,
unsigned long and unsigned long long respectively
(macro constant)
...
My lua-regex.jnc
so far (to show what I'm trying to achieve) that has this error:
jancy "lua-regex.jnc"
/home/mingo/dev/c/A_programming-languages/jancy_b/lua-regex.jnc(303,50): binary '+' cannot be applied to 'char [9]' and 'char*'
/home/mingo/dev/c/A_programming-languages/jancy_b/lua-regex.jnc(182,48): unexpected 'identifier' in 'literal'
/home/mingo/dev/c/A_programming-languages/jancy_b/lua-regex.jnc(189,36): undeclared identifier 'INT_MAX'
/home/mingo/dev/c/A_programming-languages/jancy_b/lua-regex.jnc(189,36): unable to recover from previous error(s)
/home/mingo/dev/c/A_programming-languages/jancy_b/lua-regex.jnc(42,1172): binary '+' cannot be applied to 'char [2]' and 'char const*'
5 error(s); compilation failed
lua-regex.jnc
/*
** maximum number of captures that a pattern can do during
** pattern-matching. This limit is arbitrary.
*/
alias ptrdiff_t = intptr_t;
exposed enum LuaRegexConsts
{
LUA_REGEX_MAXCAPTURES = 32,
CAP_UNFINISHED = -1,
CAP_POSITION = -2,
L_ESC = '%',
}
struct LuaCapture {
const char *init;
intptr_t len;
};
struct LuaMatchState {
const char *src_init; /* init of source string */
const char *src_end; /* end ('\0') of source string */
const char *p_end; /* end ('\0') of pattern */
size_t start_pos; /* pattern match start position */
size_t end_pos; /* pattern match end position */
const char *error;
int level; /* total number of captures (finished or unfinished) */
LuaCapture capture[LUA_REGEX_MAXCAPTURES];
};
typedef int luaregex_func_param(LuaMatchState *ms, const void *udata, void **b);
/* macro to `unsign' a character */
//#define uchar(c) ((unsigned char)(c))
static unsigned char uchar(char c) {return (unsigned char)(c);};
//static const char L_ESC = '%';
static const char SPECIALS[] = "^$*+?.([%-";
static char *LUA_QL(const char *x) {return "'" + x + "'"; }
static char *LUA_QS() {return LUA_QL("%s");}
static intptr_t posrelat (intptr_t pos, size_t len) {
/* relative string position: negative means back from end */
if (pos < 0) pos += len;
return (pos >= 0) ? pos : 0;
}
static int check_capture_all_closed (LuaMatchState *ms) {
int i;
for(i=0; i<ms->level; ++i){
if(ms->capture[i].len == CAP_UNFINISHED){
ms->error = "unfinished capture";
return 0;
}
}
return 1;
}
static int check_capture_is_closed (LuaMatchState *ms, int l) {
if (l < 0 || l >= ms->level){
ms->error = "invalid capture index";
return 0;
}
if (ms->capture[l].len == CAP_UNFINISHED){
ms->error = "unfinished capture";
return 0;
}
return 1;
}
static int check_capture (LuaMatchState *ms, int *l_out) {
int l;
*l_out -= '1';
l = *l_out;
return check_capture_is_closed(ms, l);
}
static int capture_to_close (LuaMatchState *ms, int *level_out) {
int level = ms->level;
for (level--; level>=0; level--)
if (ms->capture[level].len == CAP_UNFINISHED) {
*level_out = level;
return 1;
}
ms->error = "invalid pattern capture";
return 0;
}
static int classend (LuaMatchState *ms, const char *p, const char **result) {
switch (*p++) {
case L_ESC: {
if (p == ms->p_end){
ms->error = "malformed pattern (ends with " LUA_QL("%%") ")";
return 0;
}
*result = p+1;
return 1;
}
case '[': {
if (*p == '^') p++;
do { /* look for a `]' */
if (p == ms->p_end){
ms->error = "malformed pattern (missing " LUA_QL("]") ")";
return 0;
}
if (*(p++) == L_ESC && p < ms->p_end)
p++; /* skip escapes (e.g. `%]') */
} while (*p != ']');
*result = p+1;
return 1;
}
default: {
*result = p;
return 1;
}
}
}
static int isalpha(int c) { return (c >= 'a' && c <= 'z') || (c <= 'A' && c >= 'Z');}
static int match_class (int c, int cl) {
int res;
switch (tolower(cl)) {
case 'a' : res = isalpha(c); break;
case 'c' : res = iscntrl(c); break;
case 'd' : res = isdigit(c); break;
case 'g' : res = isgraph(c); break;
case 'l' : res = islower(c); break;
case 'p' : res = ispunct(c); break;
case 's' : res = isspace(c); break;
case 'u' : res = isupper(c); break;
case 'w' : res = isalnum(c); break;
case 'x' : res = isxdigit(c); break;
case 'z' : res = (c == 0); break; /* deprecated option */
default: return (cl == c);
}
return (islower(cl) ? res : !res);
}
static int matchbracketclass (int c, const char *p, const char *ec) {
int sig = 1;
if (*(p+1) == '^') {
sig = 0;
p++; /* skip the `^' */
}
while (++p < ec) {
if (*p == L_ESC) {
p++;
if (match_class(c, uchar(*p)))
return sig;
}
else if ((*(p+1) == '-') && (p+2 < ec)) {
p+=2;
if (uchar(*(p-2)) <= c && c <= uchar(*p))
return sig;
}
else if (uchar(*p) == c) return sig;
}
return !sig;
}
static int singlematch (int c, const char *p, const char *ep) {
switch (*p) {
case '.': return 1; /* matches any char */
case L_ESC: return match_class(c, uchar(*(p+1)));
case '[': return matchbracketclass(c, p, ep-1);
default: return (uchar(*p) == c);
}
}
//static const char *match (LuaMatchState *ms, const char *s, const char *p);
//add escape char extension from https://github.com/jcgoble3/lua-matchext
static const char *matchbalance (LuaMatchState *ms, const char *s,
const char *p) {
int escaped = (*(p-1) == 'B'); /* EXT */
if (p >= ms->p_end - 1 - escaped){
ms->error = "malformed pattern "
"(missing arguments to " LUA_QL("%%b") ")";
return null;
}
if (*s != *p) return null;
else {
int b = *p;
int e = *(p + (escaped ? 2 : 1)); /* EXT */
int esc = escaped ? *(p + 1) : INT_MAX; /* EXT */
int cont = 1;
while (++s < ms->src_end) {
if (*s == esc) s++; /* EXT */
else if (*s == e) {
if (--cont == 0) return s+1;
}
else if (*s == b) cont++;
}
}
return null; /* string ends out of balance */
}
static const char *max_expand (LuaMatchState *ms, const char *s,
const char *p, const char *ep) {
ptrdiff_t i = 0; /* counts maximum expand for item */
while ((s+i)<ms->src_end && singlematch(uchar(*(s+i)), p, ep))
i++;
/* keeps trying to match with the maximum repetitions */
while (i>=0) {
const char *res = match(ms, (s+i), ep+1);
if (res) return res;
i--; /* else didn't match; reduce 1 repetition to try again */
}
return null;
}
static const char *min_expand (LuaMatchState *ms, const char *s,
const char *p, const char *ep) {
for (;;) {
const char *res = match(ms, s, ep+1);
if (res != null)
return res;
else if (s<ms->src_end && singlematch(uchar(*s), p, ep))
s++; /* try with one more repetition */
else return null;
}
}
static const char *start_capture (LuaMatchState *ms, const char *s,
const char *p, int what) {
const char *res;
int level = ms->level;
if (level >= LUA_REGEX_MAXCAPTURES) {
ms->error = "too many captures";
return null;
}
ms->capture[level].init = s;
ms->capture[level].len = what;
ms->level = level+1;
if ((res=match(ms, s, p)) == null) /* match failed? */
ms->level--; /* undo capture */
return res;
}
static const char *end_capture (LuaMatchState *ms, const char *s,
const char *p) {
int l;
const char *res;
if(!capture_to_close(ms, &l)) return null;
ms->capture[l].len = s - ms->capture[l].init; /* close capture */
if ((res = match(ms, s, p)) == null) /* match failed? */
ms->capture[l].len = CAP_UNFINISHED; /* undo capture */
return res;
}
static const char *match_capture (LuaMatchState *ms, const char *s, int l) {
size_t len;
if(check_capture(ms, &l)){
len = ms->capture[l].len;
if ((size_t)(ms->src_end-s) >= len &&
memcmp(ms->capture[l].init, s, len) == 0)
return s+len;
}
return null;
}
static const char *match (LuaMatchState *ms, const char *s, const char *p) {
//init: /* using goto's to optimize tail recursion */
for(;;) {
if (p == ms->p_end) /* end of pattern? */
return s; /* match succeeded */
switch (*p) {
case '(': { /* start capture */
if (*(p+1) == ')') /* position capture? */
return start_capture(ms, s, p+2, CAP_POSITION);
else
return start_capture(ms, s, p+1, CAP_UNFINISHED);
}
case ')': { /* end capture */
return end_capture(ms, s, p+1);
}
case '$': {
if ((p+1) == ms->p_end) /* is the `$' the last char in pattern? */
return (s == ms->src_end) ? s : null; /* check end of string */
else break; //goto dflt;
}
case L_ESC: { /* escaped sequences not in the format class[*+?-]? */
switch (*(p+1)) {
case 'b': case 'B': { /* balanced string? */ /* EXT */
s = matchbalance(ms, s, p+2);
if (s == null) return null;
p += (*(p + 1) == 'b') ? 4 : 5; /* EXT */ continue; // goto init; /* else return match(ms, s, p+4); */
}
case 'f': { /* frontier? */
const char *ep; char previous;
p += 2;
if (*p != '['){
ms->error = "missing " + LUA_QL("[") + " after " +
LUA_QL("%%f") + " in pattern";
return null;
}
if(!classend(ms, p, &ep)) return null; /* points to what is next */
previous = (s == ms->src_init) ? '\0' : *(s-1);
if (matchbracketclass(uchar(previous), p, ep-1) ||
!matchbracketclass(uchar(*s), p, ep-1)) return null;
p=ep; continue; //goto init; /* else return match(ms, s, ep); */
}
case '0': case '1': case '2': case '3':
case '4': case '5': case '6': case '7':
case '8': case '9': { /* capture results (%0-%9)? */
s = match_capture(ms, s, uchar(*(p+1)));
if (s == null) return null;
p+=2; continue; //goto init; /* else return match(ms, s, p+2) */
}
//default: goto dflt;
}
}
}
//default: dflt:
{ /* pattern class plus optional suffix */
const char *ep;
int m;
if(!classend(ms, p, &ep)) return null; /* points to what is next */
m = s < ms->src_end && singlematch(uchar(*s), p, ep);
switch (*ep) {
case '?': { /* optional */
const char *res;
if (m && ((res=match(ms, s+1, ep+1)) != null))
return res;
p=ep+1; continue; //goto init; /* else return match(ms, s, ep+1); */
}
case '*': { /* 0 or more repetitions */
return max_expand(ms, s, p, ep);
}
case '+': { /* 1 or more repetitions */
return (m ? max_expand(ms, s+1, p, ep) : null);
}
case '-': { /* 0 or more repetitions (minimum) */
return min_expand(ms, s, p, ep);
}
default: {
if (!m) return null;
s++; p=ep; continue; //goto init; /* else return match(ms, s+1, ep); */
}
}
}
break;
}
}
static const char *lmemfind (const char *s1, size_t l1,
const char *s2, size_t l2) {
if (l2 == 0) return s1; /* empty strings are everywhere */
else if (l2 > l1) return null; /* avoids a negative `l1' */
else {
const char *init; /* to search for a `*s2' inside `s1' */
l2--; /* 1st char will be checked by `memchr' */
l1 = l1-l2; /* `s2' cannot be found after that */
while (l1 > 0 && (init = (const char *)memchr(s1, *s2, l1)) != null) {
init++; /* 1st char is already checked */
if (memcmp(init, s2+1, l2) == 0)
return init-1;
else { /* correct `l1' and `s1' to try again */
l1 -= init-s1;
s1 = init;
}
}
return null; /* not found */
}
}
/* check whether pattern has no special characters */
static int nospecials (const char *p, size_t l) {
size_t upto = 0;
do {
if (strpbrk(p + upto, SPECIALS))
return 0; /* pattern has a special character */
upto += strlen(p + upto) + 1; /* may have more after \0 */
} while (upto <= l);
return 1; /* no special chars found */
}
static ptrdiff_t str_find_aux (LuaMatchState *ms, int find, const char *s, ptrdiff_t ls,
const char *p, ptrdiff_t lp, ptrdiff_t init, int raw_find,
luaregex_func_param *fp, void *udata) {
ptrdiff_t result;
ms->error = null;
if(ls < 0) ls = strlen(s);
assert(ls >= 0);
if(lp < 0) lp = strlen(p);
assert(lp >= 0);
init = posrelat(init, ls);
if (init < 0) init = 0;
else if (init > ls + 1) { /* start after string's end? */
return 0; /* cannot find anything */
}
ms->src_init = s;
ms->src_end = s + ls;
//do_again:
for(;;) {
result = -1; /* not found */
/* explicit request or no special characters? */
if (find && (raw_find || nospecials(p, lp))) {
/* do a plain search */
const char *s2 = lmemfind(s + init, ls - init, p, lp);
if (s2) {
ms->start_pos = ((int)(s2 - s));
result = ms->end_pos = ms->start_pos+lp;
ms->level = 0;
}
}
else {
const char *s1 = s + init;
int anchor = (*p == '^');
if (anchor) {
p++; lp--; /* skip anchor character */
}
ms->p_end = p + lp;
do {
const char *res;
ms->level = 0;
if ((res=match(ms, s1, p)) != null) {
ms->start_pos = s1-s;
result = ms->end_pos = res-s;
break; //goto eofunc;
}
} while (s1++ < ms->src_end && !anchor);
}
//eofunc:
if(result >= 0){
if(!check_capture_all_closed(ms)) return 0;
if(fp && fp(ms, udata, null)) {
init = result;
if (init == ms->start_pos) ++init; /* empty match? go at least one position */
if (init < ls) continue; //goto do_again;
}
}
break;
}
return result > 0 ? ms->start_pos : result; //returning the start position
}
int main ()
{
LuaMatchState ms;
printf ("lua-regex!\n");
printf("%d\n", posrelat(-10, 12));
printf("match_class : %d\n", match_class('f', 'x'));
char const* p1 = " foo bar 100 baz";
const char * p2 = "baz";
const char *found = lmemfind(p1, strlen(p1), p2, strlen(p2));
printf("found : %s\n", found);
ptrdiff_t dt = str_find_aux(&ms, 1, p1, strlen(p1), p2, strlen(p2), 0, 1, null, null);
printf("found : %d\n", dt);
return 0;
}
And here is my initial implementation of isalpha, ...
:
diff --git a/src/jnc_ext/jnc_std/jnc/std_globals.jnc b/src/jnc_ext/jnc_std/jnc/std_globals.jnc
index 951cc88a..079a7e3c 100644
--- a/src/jnc_ext/jnc_std/jnc/std_globals.jnc
+++ b/src/jnc_ext/jnc_std/jnc/std_globals.jnc
@@ -442,6 +442,17 @@ intptr_t cdecl printf(
...
);
+bool isalpha(uint32_t c);
+bool iscntrl(uint32_t c);
+bool isdigit(uint32_t c);
+bool isgraph(uint32_t c);
+bool islower(uint32_t c);
+bool ispunct(uint32_t c);
+bool isspace(uint32_t c);
+bool isupper(uint32_t c);
+bool isalnum(uint32_t c);
+bool isxdigit(uint32_t c);
+
//! @}
namespace std {
diff --git a/src/jnc_ext/jnc_std/jnc_std_StdLib.cpp b/src/jnc_ext/jnc_std/jnc_std_StdLib.cpp
index 92bdfa16..003a871f 100644
--- a/src/jnc_ext/jnc_std/jnc_std_StdLib.cpp
+++ b/src/jnc_ext/jnc_std/jnc_std_StdLib.cpp
@@ -148,6 +148,56 @@ strtoul(
return strtot<uint64_t>(::_strtoui64, ptr, endPtr, radix);
}
+bool isAlpha(uint32_t c)
+{
+ return enc::utfIsLetter(c);
+}
+
+bool isCntrl(uint32_t c)
+{
+ return iscntrl(c);
+}
+
+bool isDigit(uint32_t c)
+{
+ return enc::utfIsDigit(c);
+}
+
+bool isGraph(uint32_t c)
+{
+ return isgraph(c);
+}
+
+bool isLower(uint32_t c)
+{
+ return enc::utfIsLowerCase(c);
+}
+
+bool isPunct(uint32_t c)
+{
+ return enc::utfIsPunctuation(c);
+}
+
+bool isSpace(uint32_t c)
+{
+ return enc::utfIsSpace(c);
+}
+
+bool isUpper(uint32_t c)
+{
+ return enc::utfIsUpperCase(c);
+}
+
+bool isAlnum(uint32_t c)
+{
+ return enc::utfIsLetterOrDigit(c);
+}
+
+bool isXdigit(uint32_t c)
+{
+ return isxdigit(c);
+}
+
uint32_t
toUpper(uint32_t c)
{
@@ -679,6 +729,17 @@ JNC_BEGIN_LIB_FUNCTION_MAP(jnc_StdLib)
JNC_MAP_OVERLOAD(setError_1)
JNC_MAP_FUNCTION("std.format", format)
+ JNC_MAP_FUNCTION("isalpha", isAlpha)
+ JNC_MAP_FUNCTION("iscntrl", isCntrl)
+ JNC_MAP_FUNCTION("isdigit", isDigit)
+ JNC_MAP_FUNCTION("isgraph", isGraph)
+ JNC_MAP_FUNCTION("islower", isLower)
+ JNC_MAP_FUNCTION("ispunct", isPunct)
+ JNC_MAP_FUNCTION("isspace", isSpace)
+ JNC_MAP_FUNCTION("isupper", isUpper)
+ JNC_MAP_FUNCTION("isalnum", isAlnum)
+ JNC_MAP_FUNCTION("isxdigit", isXdigit)
+
JNC_MAP_FUNCTION("strlen", jnc::strLen)
JNC_MAP_FUNCTION("strcmp", strCmp)
JNC_MAP_FUNCTION("strncmp", strnCmp)
My
lua-regex.jnc
so far (to show what I'm trying to achieve) that has this error:jancy "lua-regex.jnc" /home/mingo/dev/c/A_programming-languages/jancy_b/lua-regex.jnc(303,50): binary '+' cannot be applied to 'char [9]' and 'char*' /home/mingo/dev/c/A_programming-languages/jancy_b/lua-regex.jnc(182,48): unexpected 'identifier' in 'literal' /home/mingo/dev/c/A_programming-languages/jancy_b/lua-regex.jnc(189,36): undeclared identifier 'INT_MAX' /home/mingo/dev/c/A_programming-languages/jancy_b/lua-regex.jnc(189,36): unable to recover from previous error(s) /home/mingo/dev/c/A_programming-languages/jancy_b/lua-regex.jnc(42,1172): binary '+' cannot be applied to 'char [2]' and 'char const*' 5 error(s); compilation failed
You are trying to add char pointers/char arrays, and that doesn't work -- just like in C.
For building strings please use:
- formatting literals (https://github.com/vovkos/jancy/blob/master/samples/jnc/61_FormattingLiterals.jnc);
-
std.StringBuilder
(https://github.com/vovkos/jancy/blob/master/src/jnc_ext/jnc_std/jnc/std_String.jnc); - or the good old
strcat
And here is my initial implementation of
isalpha, ...
:
A PR with those standard C runtime functions would be very welcome.