/* ********************************* * Simple recursive descent parser * ***********************************/ /* * The output is an AST rather than a parse tree. * * Limitations: This is obviously no production-ready parser. The * lexical analysis is primitive. The error reporting is minimal and * in particular the parser does not make any attempt at recovering * from errors. */ #include #include #include #include using namespace std; /* * Gramar modified for recursive descent parsing (with e standing for * epsilon): ::= ::= { } ::= e | ::= ID ; ::= BASIC ::= e | [ NUM ] ::= e | ::= = ; | IF ( ) | IF ( ) ELSE | WHILE ( ) | ::= ID ::= e | [ ] ::= ::= e | || ::= ::= e | && ::= ::= e | == | != ::= ::= e | <= | >= | > | < ::= ::= e | + | - ::= ::= e | * | / ::= ! | - | ::= ( ) | | NUM | REAL | TRUE | FALSE */ /* * Nonterminals (used to label the nodes in the parse tree): */ enum nonterminal { NT_TERMINAL, NT_PROGRAM, NT_BLOCK, NT_DECLS, NT_DECL, NT_TYPE, NT_TYPECL, NT_STMTS, NT_STMT, NT_LOC, NT_LOCCL, NT_BOOL, NT_BOOLCL, NT_JOIN, NT_JOINCL, NT_EQUALITY, NT_EQUALCL, NT_REL, NT_RELTAIL, NT_EXPR, NT_EXPRCL, NT_TERM, NT_TERMCL, NT_UNARY, NT_FACTOR }; /* * Terminal types: */ enum vocab { VOC_EOS, VOC_OPEN_PAREN, VOC_CLS_PAREN, VOC_OPEN_BRACE, VOC_CLS_BRACE, VOC_OPEN_SQPAR, VOC_CLS_SQPAR, VOC_IF, VOC_ELSE, VOC_WHILE, VOC_ID, VOC_AND, VOC_OR, VOC_ASSIGN, VOC_EQ, VOC_NEQ, VOC_LTEQ, VOC_GTEQ, VOC_LT, VOC_GT, VOC_PLUS, VOC_MINUS, VOC_NUM, VOC_REAL, VOC_TRUE, VOC_FALSE, VOC_NOT, VOC_MUL, VOC_DIV, VOC_BASIC, VOC_SEMICOLON }; /* * Tokens are structures containing the actual string (the lexeme) and * also the type. */ struct token { string str; vocab type; token(string str, vocab tok) : str(str), type(tok) { } token() : type(VOC_EOS) { } }; /* * Nodes in the parse tree. The member variable nt contains the * nonterminal label for the node, or NT_TERMINAL if the node is * labelled with a terminal (or token). For terminal nodes the member * variable term contains the respective token. * * Internal nodes are labelled with nonterminals while leaves are * labelled with either terminals (tokens) or nonterminals that were * rewritten to the empty string. */ class node { token term; nonterminal nt; list children; public: node(nonterminal n) : nt(n) { } node(token tok) : term(tok), nt(NT_TERMINAL) { } ~node(); void addChild(node* child) { children.push_back(child); } bool isToken() const { return nt == NT_TERMINAL; } bool isNonterminal() const { return !isToken(); } void print(unsigned indent) const; private: string printVal() const; string printNonterminal(nonterminal) const; }; node::~node() { for (list::const_iterator i = children.begin(); i != children.end(); ++i) { delete *i; } } /* * Pretty print functions. */ /* * Prints the value stored in the node (either nonterminal or token): */ string node::printVal() const { if (!isNonterminal()) { return term.str; } else { return printNonterminal(nt); } } /* * Provides strings for each nonterminal type: */ string node::printNonterminal(nonterminal nt) const { switch (nt) { case NT_PROGRAM: return "program"; case NT_BLOCK: return "block"; case NT_DECLS: return "decls"; case NT_DECL: return "decl"; case NT_TYPE: return "type"; case NT_TYPECL: return "typecl"; case NT_STMTS: return "stmts"; case NT_STMT: return "stmt"; case NT_LOC: return "loc"; case NT_LOCCL: return "loccl"; case NT_BOOL: return "bool"; case NT_BOOLCL: return "boolcl"; case NT_JOIN: return "join"; case NT_JOINCL: return "joincl"; case NT_EQUALITY: return "equality"; case NT_EQUALCL: return "equalitycl"; case NT_REL: return "rel"; case NT_RELTAIL: return "reltail"; case NT_EXPR: return "expr"; case NT_EXPRCL: return "exprcl"; case NT_TERM: return "term"; case NT_TERMCL: return "termcl"; case NT_UNARY: return "unary"; case NT_FACTOR: return "factor"; default: return "???"; // we will hopefully never reach this... } } /* * Pretty prints the whole parse tree recursively. */ void node::print(unsigned indent = 0) const { string symbol = printVal(); if (isNonterminal()) cout << string(indent, ' ') << "<" << symbol << ">" << '\n'; else cout << string(indent, ' ') << symbol << '\n'; for (list::const_iterator i = children.begin(); i != children.end(); ++i) { node* child = (*i); child->print(indent + 2); } if (isNonterminal()) cout << string(indent, ' ') << "" << '\n'; } /******************** * Lexical analysis * ********************/ /* * Determine whether a string represents a literal floating point. * Source: . */ bool isDouble(string s) { std::istringstream iss(s); double d; char c; return iss >> d && !(iss >> c); } /* * Determine if a string is a literal int. * Source: . */ bool isInteger(const std::string& s) { if (s.empty() || ((!isdigit(s[0])) && (s[0] != '-') && (s[0] != '+'))) return false; char * p; strtol(s.c_str(), &p, 10); return (*p == 0); } /* * Main function for lexical analysis. */ token getToken() { string curr; if ( cin.eof() ) { return token(); // EOS on end of file } cin >> curr; if ( curr == "") { return token(); // EOS on end of file } // Special (fixed) tokens: if ( curr == "(" ) return token(curr, VOC_OPEN_PAREN); if ( curr == ")" ) return token(curr, VOC_CLS_PAREN); if ( curr == "{" ) return token(curr, VOC_OPEN_BRACE); if ( curr == "}" ) return token(curr, VOC_CLS_BRACE); if ( curr == "[" ) return token(curr, VOC_OPEN_SQPAR); if ( curr == "]" ) return token(curr, VOC_CLS_SQPAR); if ( curr == "if" ) return token(curr, VOC_IF); if ( curr == "while" ) return token(curr, VOC_WHILE); if ( curr == "else" ) return token(curr, VOC_ELSE); if ( curr == "&&" ) return token(curr, VOC_AND); if ( curr == "||" ) return token(curr, VOC_OR); if ( curr == "=" ) return token(curr, VOC_ASSIGN); if ( curr == "==" ) return token(curr, VOC_EQ); if ( curr == "!=" ) return token(curr, VOC_NEQ); if ( curr == "<=" ) return token(curr, VOC_LTEQ); if ( curr == ">=" ) return token(curr, VOC_GTEQ); if ( curr == "<" ) return token(curr, VOC_LT); if ( curr == ">" ) return token(curr, VOC_GT); if ( curr == "+" ) return token(curr, VOC_PLUS); if ( curr == "-" ) return token(curr, VOC_MINUS); if ( curr == "true" ) return token(curr, VOC_TRUE); if ( curr == "false" ) return token(curr, VOC_FALSE); if ( curr == "!" ) return token(curr, VOC_NOT); if ( curr == "*" ) return token(curr, VOC_MUL); if ( curr == "/" ) return token(curr, VOC_DIV); if ( curr == ";" ) return token(curr, VOC_SEMICOLON); if ( curr == "int" ) return token(curr, VOC_BASIC); if ( curr == "char" ) return token(curr, VOC_BASIC); if ( curr == "bool" ) return token(curr, VOC_BASIC); if ( curr == "double" ) return token(curr, VOC_BASIC); if (isInteger(curr)) { // Integer literal: return token(curr, VOC_NUM); } if (isDouble(curr)) { // Floating point literal: return token(curr, VOC_REAL); } // Everything else is an ID: return token(curr, VOC_ID); } /**************************** * Recursive descent parser * ****************************/ /* * Current token throughout the parsing process. */ token tok; /* * Compares the current token with the argument. If they are the same * then obtains a new current token and returns. Otherwise interrupts * the parser. */ void mustBe(vocab v) { if (v != tok.type) { cerr << "Unexpected token: " << tok.str << "\n"; exit(1); } else tok = getToken(); } /* * Nonterminal functions as per the grammar given at the beginning of * the file: */ node* Bool(); // forward declarations needed in Factor() node* Loc(); // ::= ( ) // | // | NUM // | REAL // | TRUE // | FALSE node* Factor() { node* current = new node (NT_FACTOR); switch(tok.type) { case VOC_NUM: case VOC_REAL: case VOC_TRUE: case VOC_FALSE: current -> addChild(new node (tok)); tok = getToken(); break; case VOC_ID: current -> addChild(Loc()); break; default: mustBe(VOC_OPEN_PAREN); current -> addChild(Bool()); mustBe(VOC_CLS_PAREN); } return current; } // ::= ! // | - // | node* Unary() { node* current = new node (NT_UNARY); switch(tok.type) { case VOC_NOT: case VOC_MINUS: current -> addChild(new node (tok)); tok = getToken(); current -> addChild(Unary()); break; default: current -> addChild(Factor()); } return current; } // ::= e // | * // | / node* Termcl() { node* current = new node (NT_TERMCL); switch(tok.type) { case VOC_MUL: case VOC_DIV: current -> addChild(new node(tok)); tok = getToken(); current -> addChild(Unary()); current -> addChild(Termcl()); break; default: break; } return current; } // ::= node* Term() { node* current = new node (NT_TERM); current -> addChild(Unary()); current -> addChild(Termcl()); return current; } // ::= e // | + // | - node* Exprcl() { node* current = new node (NT_EXPRCL); switch(tok.type) { case VOC_PLUS: case VOC_MINUS: current -> addChild(new node(tok)); tok = getToken(); current -> addChild(Term()); current -> addChild(Exprcl()); break; default: break; } return current; } // ::= node* Expr() { node* current = new node (NT_EXPR); current -> addChild(Term()); current -> addChild(Exprcl()); return current; } // ::= e // | <= // | >= // | > // | < node* Reltail() { node* current = new node (NT_RELTAIL); switch(tok.type) { case VOC_LTEQ: case VOC_GTEQ: case VOC_LT: case VOC_GT: current -> addChild(new node(tok)); tok = getToken(); current -> addChild(Expr()); break; default: break; } return current; } // ::= node* Rel() { node* current = new node (NT_REL); current -> addChild(Expr()); current -> addChild(Reltail()); return current; } // ::= e // | == // | != node* Equalcl() { node* current = new node (NT_EQUALCL); switch(tok.type) { case VOC_EQ: case VOC_NEQ: current -> addChild(new node(tok)); tok = getToken(); current -> addChild(Rel()); current -> addChild(Equalcl()); break; default: break; } return current; } // ::= node* Equality() { node* current = new node (NT_EQUALITY); current -> addChild(Rel()); current -> addChild(Equalcl()); return current; } // ::= e // | && node* Joincl() { node* current = new node (NT_JOINCL); switch(tok.type) { case VOC_AND: current -> addChild(new node(tok)); tok = getToken(); current -> addChild(Equality()); current -> addChild(Joincl()); break; default: break; } return current; } // ::= node* Join() { node* current = new node (NT_JOIN); current -> addChild(Equality()); current -> addChild(Joincl()); return current; } // ::= e // | || node* Boolcl() { node* current = new node (NT_BOOLCL); switch(tok.type) { case VOC_OR: current -> addChild(new node(tok)); tok = getToken(); current -> addChild(Join()); current -> addChild(Boolcl()); break; default: break; } return current; } // ::= node* Bool() { node* current = new node (NT_BOOL); current -> addChild(Join()); current -> addChild(Boolcl()); return current; } // ::= e // | [ ] node* Loccl() { node* current = new node (NT_LOCCL); switch(tok.type) { case VOC_OPEN_SQPAR: tok = getToken(); current -> addChild(Bool()); mustBe(VOC_CLS_SQPAR); current -> addChild(Loccl()); break; default: break; } return current; } // ::= ID node* Loc() { node* current = new node (NT_LOC); current -> addChild(new node(tok)); mustBe(VOC_ID); current -> addChild(Loccl()); return current; } node* Block(); // Forward declaration for Stmt() // ::= = ; // | IF ( ) // | IF ( ) ELSE // | WHILE ( ) // | node* Stmt() { node* current = new node (NT_STMT); switch(tok.type) { case VOC_IF: current -> addChild(new node(tok)); tok = getToken(); mustBe(VOC_OPEN_PAREN); current -> addChild(Bool()); mustBe(VOC_CLS_PAREN); current -> addChild(Stmt()); if (tok.type == VOC_ELSE) { current -> addChild(new node(tok)); tok = getToken(); current -> addChild(Stmt()); } break; case VOC_WHILE: current -> addChild(new node(tok)); tok = getToken(); mustBe(VOC_OPEN_PAREN); current -> addChild(Bool()); mustBe(VOC_CLS_PAREN); current -> addChild(Stmt()); break; case VOC_OPEN_BRACE: current -> addChild(Block()); break; default: current -> addChild(Loc()); current -> addChild(new node(tok)); mustBe(VOC_ASSIGN); current -> addChild(Bool()); mustBe(VOC_SEMICOLON); break; } return current; } // ::= e // | node* Stmts() { node* current = new node (NT_STMTS); switch(tok.type) { case VOC_CLS_BRACE: // More concise to start with Follow() break; default: current -> addChild(Stmt()); current -> addChild(Stmts()); break; } return current; } // ::= e // | [ NUM ] node* Typecl() { node* current = new node (NT_TYPECL); switch(tok.type) { case VOC_OPEN_SQPAR: tok=getToken(); current -> addChild(new node(tok)); mustBe(VOC_NUM); mustBe(VOC_CLS_SQPAR); current -> addChild(Typecl()); break; default: break; } return current; } // ::= BASIC node* Type() { node* current = new node (NT_TYPE); current -> addChild(new node(tok)); mustBe(VOC_BASIC); current -> addChild(Typecl()); return current; } // ::= ID ; node* Decl() { node* current = new node (NT_DECL); current -> addChild(Type()); current -> addChild(new node(tok)); mustBe(VOC_ID); mustBe(VOC_SEMICOLON); return current; } // ::= e // | // Note: Follow() = First() + Follow() node* Decls() { node* current = new node (NT_DECLS); switch(tok.type) { case VOC_IF: case VOC_WHILE: case VOC_OPEN_BRACE: case VOC_ID: case VOC_CLS_BRACE: break; default: current -> addChild(Decl()); current -> addChild(Decls()); break; } return current; } // ::= { } node* Block() { node* current = new node (NT_BLOCK); mustBe(VOC_OPEN_BRACE); current -> addChild(Decls()); current -> addChild(Stmts()); mustBe(VOC_CLS_BRACE); return current; } // ::= node* Program() { node* current = new node (NT_PROGRAM); current -> addChild(Block()); return current; } int main() { tok = getToken(); node* tree = Program(); // Call the axiom function to parse mustBe(VOC_EOS); // That's how we know we have been successful! if (tree) { tree->print(); delete tree; return 0; } else return 1; }