/** * 61c, Fall 2007 * * You are explicitly allowed to cut and paste any part of this file * into your own solution (this is not plagarism). * * You should fill in the parts where you see "throw new Error()" * */ import java.io.*; import java.util.*; public class Lexer { /** * A simple helper method to check if the subregion of input * starting at offset matches string test. */ private static boolean strcmp(char[] input, int offset, String test) { if (offset+test.length() > input.length) return false; return (new String(input, offset, test.length()).equals(test)); } /** * Assuming that the entire program resides in the char[] called * "input", and is "inputlen" characters long, break the input up * into tokens. * * If the input has "n" tokens, then the first "n" elements of * token_type, token_start, and token_len should be filled in. * For the "i^th" token, token_type[i] should contain the type of * that token (use one of the constants at the end of this file), * token_start[i] should be the offset of the first character of * the token within "input", and token_len[i] should be the * length of the i^th token. * * You may assume that all three arrays are allocated with enough * space. */ public static int lex(char[] input, int inputlen, int[] token_type, int[] token_start, int[] token_len ) throws Exception { // the number of tokens emitted so far int numtokens = 0; // the start of the token currently being parsed int start = 0; // true if we are currently in the middle of a comment boolean in_a_comment = false; // true if we are currently in the middle of a string boolean in_a_string_literal = false; for(int i=0; i= inputlen-1 ? 0 : input[i+1]; // the code below will "continue" if we have NOT reached // the end of a token; otherwise it falls through to code // at the bottom which records the token if (in_a_comment) { throw new Error(); } else if (in_a_string_literal) { throw new Error(); } else if (c >= '0' && c <= '9') { while(i= '0' && input[i+1] <= '9')) i++; token_type[numtokens] = LITERAL_NUMBER; } else { switch(c) { case '#': while(i>")) { token_type[numtokens] = TOKEN_SHIFT_RIGHT; i += ">>".length()-1; break; } if (strcmp(input, i, "++")) { token_type[numtokens] = TOKEN_PLUS_PLUS; i += "++".length()-1; break; } if (strcmp(input, i, "--")) { token_type[numtokens] = TOKEN_MINUS_MINUS; i += "--".length()-1; break; } if (strcmp(input, i, "->")) { token_type[numtokens] = TOKEN_ARROW; i += "->".length()-1; break; } if (strcmp(input, i, "&&")) { token_type[numtokens] = TOKEN_AND_AND; i += "&&".length()-1; break; } if (strcmp(input, i, "||")) { token_type[numtokens] = TOKEN_OR_OR; i += "||".length()-1; break; } if (strcmp(input, i, "<=")) { token_type[numtokens] = TOKEN_LESS_THAN_OR_EQUAL_TO; i += "<=".length()-1; break; } if (strcmp(input, i, "==")) { token_type[numtokens] = TOKEN_EQUAL_EQUAL; i += "==".length()-1; break; } if (strcmp(input, i, "!=")) { token_type[numtokens] = TOKEN_BANG_EQUAL; i += "!=".length()-1; break; } if (strcmp(input, i, ";")) { token_type[numtokens] = TOKEN_SEMICOLON; i += ";".length()-1; break; } if (strcmp(input, i, "{")) { token_type[numtokens] = TOKEN_BRACE_OPEN; i += "{".length()-1; break; } if (strcmp(input, i, "}")) { token_type[numtokens] = TOKEN_BRACE_CLOSE; i += "}".length()-1; break; } if (strcmp(input, i, ",")) { token_type[numtokens] = TOKEN_COMMA; i += ",".length()-1; break; } if (strcmp(input, i, ":")) { token_type[numtokens] = TOKEN_COLON; i += ":".length()-1; break; } if (strcmp(input, i, "=")) { token_type[numtokens] = TOKEN_EQUALS; i += "=".length()-1; break; } if (strcmp(input, i, "(")) { token_type[numtokens] = TOKEN_PARENTHESIS_OPEN; i += "(".length()-1; break; } if (strcmp(input, i, ")")) { token_type[numtokens] = TOKEN_PARENTHESIS_CLOSE; i += ")".length()-1; break; } if (strcmp(input, i, "[")) { token_type[numtokens] = TOKEN_BRACKET_OPEN; i += "[".length()-1; break; } if (strcmp(input, i, "]")) { token_type[numtokens] = TOKEN_BRACKET_CLOSE; i += "]".length()-1; break; } if (strcmp(input, i, ".")) { token_type[numtokens] = TOKEN_PERIOD; i += ".".length()-1; break; } if (strcmp(input, i, "&")) { token_type[numtokens] = TOKEN_AND; i += "&".length()-1; break; } if (strcmp(input, i, "!")) { token_type[numtokens] = TOKEN_BANG; i += "!".length()-1; break; } if (strcmp(input, i, "~")) { token_type[numtokens] = TOKEN_TILDE; i += "~".length()-1; break; } if (strcmp(input, i, "-")) { token_type[numtokens] = TOKEN_MINUS; i += "-".length()-1; break; } if (strcmp(input, i, "+")) { token_type[numtokens] = TOKEN_PLUS; i += "+".length()-1; break; } if (strcmp(input, i, "*")) { token_type[numtokens] = TOKEN_STAR; i += "*".length()-1; break; } if (strcmp(input, i, "/")) { token_type[numtokens] = TOKEN_SLASH; i += "/".length()-1; break; } if (strcmp(input, i, "%")) { token_type[numtokens] = TOKEN_PERCENT; i += "%".length()-1; break; } if (strcmp(input, i, "<")) { token_type[numtokens] = TOKEN_LESS_THAN; i += "<".length()-1; break; } if (strcmp(input, i, ">")) { token_type[numtokens] = TOKEN_GREATER_THAN; i += ">".length()-1; break; } if (strcmp(input, i, "|")) { token_type[numtokens] = TOKEN_OR; i += "|".length()-1; break; } if (strcmp(input, i, "?")) { token_type[numtokens] = TOKEN_QUESTION_MARK; i += "?".length()-1; break; } token_type[numtokens] = IDENTIFIER; throw new Error(); } } } // record the token (assume that token_type is populated already) token_start[numtokens] = start; token_len[numtokens] = (i+1)-start; start = i+1; numtokens++; } return numtokens; } public static String print_token(char[] input, int token_type, int token_start, int token_len) { String name = null; switch(token_type) { case TOKEN_WHITESPACE: name = "TOKEN_WHITESPACE"; break; case TOKEN_COMMENT: name = "TOKEN_COMMENT"; break; case KEYWORD_BREAK: name = "KEYWORD_BREAK"; break; case KEYWORD_CASE: name = "KEYWORD_CASE"; break; case KEYWORD_CHAR: name = "KEYWORD_CHAR"; break; case KEYWORD_CONST: name = "KEYWORD_CONST"; break; case KEYWORD_CONTINUE: name = "KEYWORD_CONTINUE"; break; case KEYWORD_DEFAULT: name = "KEYWORD_DEFAULT"; break; case KEYWORD_DO: name = "KEYWORD_DO"; break; case KEYWORD_DOUBLE: name = "KEYWORD_DOUBLE"; break; case KEYWORD_ELSE: name = "KEYWORD_ELSE"; break; case KEYWORD_FLOAT: name = "KEYWORD_FLOAT"; break; case KEYWORD_FOR: name = "KEYWORD_FOR"; break; case KEYWORD_GOTO: name = "KEYWORD_GOTO"; break; case KEYWORD_IF: name = "KEYWORD_IF"; break; case KEYWORD_LONG: name = "KEYWORD_LONG"; break; case KEYWORD_RETURN: name = "KEYWORD_RETURN"; break; case KEYWORD_SHORT: name = "KEYWORD_SHORT"; break; case KEYWORD_SIGNED: name = "KEYWORD_SIGNED"; break; case KEYWORD_SIZEOF: name = "KEYWORD_SIZEOF"; break; case KEYWORD_STRUCT: name = "KEYWORD_STRUCT"; break; case KEYWORD_SWITCH: name = "KEYWORD_SWITCH"; break; case KEYWORD_TYPEDEF: name = "KEYWORD_TYPEDEF"; break; case KEYWORD_UNION: name = "KEYWORD_UNION"; break; case KEYWORD_UNSIGNED: name = "KEYWORD_UNSIGNED"; break; case KEYWORD_VOID: name = "KEYWORD_VOID"; break; case KEYWORD_WHILE: name = "KEYWORD_WHILE"; break; case LITERAL_NUMBER: name = "LITERAL_NUMBER"; break; case LITERAL_STRING: name = "LITERAL_STRING"; break; case TOKEN_SHIFT_LEFT: name = "TOKEN_SHIFT_LEFT"; break; case TOKEN_SHIFT_RIGHT: name = "TOKEN_SHIFT_RIGHT"; break; case TOKEN_PLUS_PLUS: name = "TOKEN_PLUS_PLUS"; break; case TOKEN_MINUS_MINUS: name = "TOKEN_MINUS_MINUS"; break; case TOKEN_ARROW: name = "TOKEN_ARROW"; break; case TOKEN_AND_AND: name = "TOKEN_AND_AND"; break; case TOKEN_OR_OR: name = "TOKEN_OR_OR"; break; case TOKEN_LESS_THAN_OR_EQUAL_TO: name = "TOKEN_LESS_THAN_OR_EQUAL_TO"; break; case TOKEN_EQUAL_EQUAL: name = "TOKEN_EQUAL_EQUAL"; break; case TOKEN_BANG_EQUAL: name = "TOKEN_BANG_EQUAL"; break; case TOKEN_SEMICOLON: name = "TOKEN_SEMICOLON"; break; case TOKEN_BRACE_OPEN: name = "TOKEN_BRACE_OPEN"; break; case TOKEN_BRACE_CLOSE: name = "TOKEN_BRACE_CLOSE"; break; case TOKEN_COMMA: name = "TOKEN_COMMA"; break; case TOKEN_COLON: name = "TOKEN_COLON"; break; case TOKEN_EQUALS: name = "TOKEN_EQUALS"; break; case TOKEN_PARENTHESIS_OPEN: name = "TOKEN_PARENTHESIS_OPEN"; break; case TOKEN_PARENTHESIS_CLOSE: name = "TOKEN_PARENTHESIS_CLOSE"; break; case TOKEN_BRACKET_OPEN: name = "TOKEN_BRACKET_OPEN"; break; case TOKEN_BRACKET_CLOSE: name = "TOKEN_BRACKET_CLOSE"; break; case TOKEN_PERIOD: name = "TOKEN_PERIOD"; break; case TOKEN_AND: name = "TOKEN_AND"; break; case TOKEN_BANG: name = "TOKEN_BANG"; break; case TOKEN_TILDE: name = "TOKEN_TILDE"; break; case TOKEN_MINUS: name = "TOKEN_MINUS"; break; case TOKEN_PLUS: name = "TOKEN_PLUS"; break; case TOKEN_STAR: name = "TOKEN_STAR"; break; case TOKEN_SLASH: name = "TOKEN_SLASH"; break; case TOKEN_PERCENT: name = "TOKEN_PERCENT"; break; case TOKEN_LESS_THAN: name = "TOKEN_LESS_THAN"; break; case TOKEN_GREATER_THAN: name = "TOKEN_GREATER_THAN"; break; case TOKEN_OR: name = "TOKEN_OR"; break; case TOKEN_QUESTION_MARK: name = "TOKEN_QUESTION_MARK"; break; case IDENTIFIER: name = "IDENTIFIER"; break; case PREPROCESSOR_DIRECTIVE: name = "PREPROCESSOR_DIRECTIVE"; break; default: name = "**UNKNOWN!**"; break; } return name + ": \"" + new String(input, token_start, token_len) + "\""; } //////////////////////////////////////////////////////////////////////////////// // derived from http://www.lysator.liu.se/c/ANSI-C-grammar-l.html public static final int TOKEN_WHITESPACE = 0; public static final int TOKEN_COMMENT = 1; public static final int KEYWORD_BREAK = 100; public static final int KEYWORD_CASE = 101; public static final int KEYWORD_CHAR = 102; public static final int KEYWORD_CONST = 103; public static final int KEYWORD_CONTINUE = 104; public static final int KEYWORD_DEFAULT = 105; public static final int KEYWORD_DO = 106; public static final int KEYWORD_DOUBLE = 107; public static final int KEYWORD_ELSE = 108; public static final int KEYWORD_FLOAT = 109; public static final int KEYWORD_FOR = 110; public static final int KEYWORD_GOTO = 111; public static final int KEYWORD_IF = 112; public static final int KEYWORD_LONG = 113; public static final int KEYWORD_RETURN = 114; public static final int KEYWORD_SHORT = 115; public static final int KEYWORD_SIGNED = 116; public static final int KEYWORD_SIZEOF = 117; public static final int KEYWORD_STRUCT = 118; public static final int KEYWORD_SWITCH = 119; public static final int KEYWORD_TYPEDEF = 120; public static final int KEYWORD_UNION = 121; public static final int KEYWORD_UNSIGNED = 122; public static final int KEYWORD_VOID = 123; public static final int KEYWORD_WHILE = 124; public static final int LITERAL_NUMBER = 200; public static final int LITERAL_STRING = 201; public static final int TOKEN_SHIFT_LEFT = 300; public static final int TOKEN_SHIFT_RIGHT = 301; public static final int TOKEN_PLUS_PLUS = 302; public static final int TOKEN_MINUS_MINUS = 303; public static final int TOKEN_ARROW = 304; public static final int TOKEN_AND_AND = 305; public static final int TOKEN_OR_OR = 306; public static final int TOKEN_LESS_THAN_OR_EQUAL_TO = 307; public static final int TOKEN_EQUAL_EQUAL = 308; public static final int TOKEN_BANG_EQUAL = 309; public static final int TOKEN_SEMICOLON = 400; public static final int TOKEN_BRACE_OPEN = 401; public static final int TOKEN_BRACE_CLOSE = 402; public static final int TOKEN_COMMA = 403; public static final int TOKEN_COLON = 404; public static final int TOKEN_EQUALS = 405; public static final int TOKEN_PARENTHESIS_OPEN = 406; public static final int TOKEN_PARENTHESIS_CLOSE = 407; public static final int TOKEN_BRACKET_OPEN = 408; public static final int TOKEN_BRACKET_CLOSE = 409; public static final int TOKEN_PERIOD = 410; public static final int TOKEN_AND = 411; public static final int TOKEN_BANG = 412; public static final int TOKEN_TILDE = 413; public static final int TOKEN_MINUS = 414; public static final int TOKEN_PLUS = 415; public static final int TOKEN_STAR = 416; public static final int TOKEN_SLASH = 417; public static final int TOKEN_PERCENT = 418; public static final int TOKEN_LESS_THAN = 419; public static final int TOKEN_GREATER_THAN = 420; public static final int TOKEN_OR = 421; public static final int TOKEN_QUESTION_MARK = 422; public static final int IDENTIFIER = 500; public static final int PREPROCESSOR_DIRECTIVE = 600; }