-- token.e -- a really weird, fast, maybe-inefficient tokenizer -- in a remarkably small amount of code (not so small anymore, eh?) include graphics.e constant PARENT_SET = #1000000 -- this is a hack to make escape chars work sequence error_msg global integer token_type, abort_on_error error_msg = "" abort_on_error = 1 global function accept(sequence text, integer index, sequence set) -- returns the last index position of the accepted text object o, parent token_type = 0 while index <= length(text) do o = set[text[index]] if sequence(o) then parent = set set = o if length(set) != 255 then error_msg = set return index end if elsif o < 0 then if o = -PARENT_SET then set = parent index -= 1 set[text[index]] = 0 exit end if token_type = -o return index+1 elsif o then if o = PARENT_SET then set = parent else token_type = o end if else exit end if index += 1 end while if not token_type then for i = 1 to length(set) do if integer(set[i]) and set[i] then token_type = set[i] if token_type < 0 then token_type = -token_type end if return index end if end for if not token_type then error_msg = set['\n'] return index end if end if return index end function function group_set(sequence group, object default) sequence set,list object o set = repeat(default, 255) for g = 1 to length(group) by 2 do list = group[g] o = group[g+1] for i = 1 to length(list) do if list[i] = '~' then set[list[i-1] .. list[i+1]] = repeat(o, 1+list[i+1]-list[i-1]) else set[list[i]] = o end if end for end for return set end function function build_set(sequence list, object o) return group_set({list, o}, 0) end function global constant -- these are color-coded for fun WHITESPACE = 7, IDENTIFIER = 15, NUMERIC = 10, COMMENT = 7, OPERATOR = 9, STRING = 11 --constant t0=time() constant -- these data structures took 3 months to perfect. DON'T TOUCH! NUM = build_set("0~9", NUMERIC), EXP = build_set("eE", group_set({"0~9+-", NUM}, "exponent not formed correctly")), FRC = build_set(".", NUM + EXP + build_set(".", -PARENT_SET)), ESC = group_set({"nrt\"\'\\", PARENT_SET}, "unknown escape character"), SET = group_set({ " \n\r\t", build_set(" \n\r\t", WHITESPACE), "A~Za~z", build_set("A~Za~z_0~9", IDENTIFIER), "0~9", NUM + EXP + FRC, "#", build_set("0~9A~Fa~f", NUMERIC), "-", group_set({"=", -OPERATOR, "-", group_set({"\n", 0},COMMENT)},0), "+*/&!<>", build_set("=", -OPERATOR), "=()[]{,}?", -OPERATOR, ".", group_set({".", -OPERATOR, "0~9", NUM + EXP}, 0), "\"", group_set({"\\", ESC, "\"", -STRING, "\n", "end of line reached with no closing \""}, STRING), "\'", group_set({"\\", ESC, "\'", -STRING, "\n", "end of line reached with no closing \'"}, STRING) },0) --? time()-t0 procedure print_error_msg(sequence msg, sequence text, integer index) integer line, first, last first = 1 line = 1 for i = 1 to index-1 do if text[i] = '\n' then first = i+1 line += 1 end if end for text &= '\n' last = find('\n', text[first..length(text)]) + first-1 printf(2, ":%d\n%s\n%s%s^\n", {line, msg, text[first..last], repeat(' ', index-first)}) if abort_on_error then abort(1) else error_msg = "" end if end procedure global function tokenize(sequence text) integer index, new_index sequence result index = 1 result = {} while index <= length(text) do new_index = accept(text, index, SET) if length(error_msg) then print_error_msg(error_msg, text, new_index) end if if index = new_index then --exit new_index = index+1 end if result &= {text[index..new_index-1]} index = new_index end while return result end function global sequence token, token_text, ignored_tokens global integer token_index, token_echo ignored_tokens = {} token_echo = 0 global procedure token_init(sequence text) token_text = text token_index = 1 end procedure global procedure next_token() integer new_index if token_index <= length(token_text) then new_index = accept(token_text, token_index, SET) if length(error_msg) then print_error_msg(error_msg, token_text, new_index) end if if token_index = new_index then --exit new_index = token_index+1 end if token = token_text[token_index..new_index-1] token_index = new_index if token_echo then if token_type then text_color(token_type) else text_color(8) end if puts(1, token) end if if find(token_type, ignored_tokens) then next_token() end if else token_type = 0 token = "" end if end procedure global procedure syntax_error(sequence msg) print_error_msg(msg, token_text, token_index-length(token)) end procedure --global procedure back_token() -- token_index -= length(token) -- token = "" -- token_type = 0 --end procedure global sequence last_token global function test(object expected) if equal(token, expected) or equal(token_type, expected) then last_token = token next_token() return 1 end if return 0 end function global procedure expect(object expected) if not test(expected) then syntax_error("Expected: "&expected) end if end procedure --global procedure expect2(object e1, object e2) -- expect(e1) -- expect(e2) --end procedure