/*
 * phc -- the open source PHP compiler
 * See license/README.license for licensing information
 *
 * Lexical analyser
 *
 * NOTE:
 * In PHP, the constant "01090" is taken to be an octal number; everything
 * from the 9 onwards is silently ignored (so, the number evaluates to 8
 * decimal). phc will give an error message instead.
 */

%{
/*
 * NOTE(review): the next three #include directives carry no header name in
 * this copy of the file -- presumably angle-bracket system headers that were
 * lost in extraction. Confirm against the original source.
 */
#include
#include
#include
#include "generated/php_parser.tab.hpp"
#include "generated/cmdline.h"
#include "parsing/PHP_lexer.h"

/* Parsed command line options; dump_tokens_flag is read by the macros below. */
extern struct gengetopt_args_info args_info;

/* Keyword lookup: returns the matching keyword entry, or 0 if the word is
 * not a keyword (see its use in the identifier rule). */
extern "C" const struct keyword* in_word_set(const char*, unsigned int);

/* yywrap() always evaluates to 1; YY_SKIP_YYWRAP suppresses flex's own
 * declaration of it. */
#define yywrap() 1
#define YY_SKIP_YYWRAP

/*
 * Macros to return a token
 * If dump_tokens_flag is set, also print the token to stdout
 *
 * RETURN(x)        prints the source line and the spelling of x, records in
 *                  after_arrow whether x is O_SINGLEARROW, and returns x.
 *                  Note that x is evaluated more than once.
 * RETURN_OP(x)     prints x as a single character, clears after_arrow, and
 *                  returns x.
 * RETURN_ALL(state) starts emitting the tokens queued with schedule_return():
 *                  remembers `state' as the state to resume in, switches to
 *                  RET_MULTI, and returns the first queued token. It expands
 *                  to several statements, so it must be used inside braces.
 */
#define RETURN(x) {										\
	if(args_info.dump_tokens_flag)						\
		printf("%ld: " #x "\n", source_line);			\
	after_arrow = (x) == O_SINGLEARROW;					\
	return x; }
#define RETURN_OP(x) {									\
	if(args_info.dump_tokens_flag)						\
		printf("%ld: SIMPLE_OP %c\n", source_line, x);	\
	after_arrow = false;								\
	return x; }
#define RETURN_ALL(state)								\
	mt_final_state = state;								\
	mt_index = 1;										\
	BEGIN(RET_MULTI);									\
	semantic_value = mt_lval[0];						\
	RETURN(mt_type[0]);

/* One entry of the keyword table searched by in_word_set(). */
struct keyword { char* name; int token; };
%}

%option c++
%option yyclass="PHP_lexer"

	/* Define lexical states (all exclusive start conditions) */

%x PHP
%x SQ_STR
%x SQ_ESC
%x BT_STR
%x DQ_STR
%x HD_STR
%x HD_NL
%x HD_MAIN
%x HD_END
%x ESCAPE
%x ML_COMM
%x SL_COMM
%x COMPLEX1
%x COMPLEX2
%x RET_MULTI

	/* Define a few tokens referenced in the grammar, below */

	/* NL: an optional CR followed by an optional LF. */
NL \r?\n?
	/* WS: a single space, tab, LF or CR. */
WS [ \t\n\r]
	/* ANY: any single byte, including newlines. */
ANY [\x00-\xff]
	/* NOTE(review): the quoted part of START is empty in this copy --
	 * presumably the PHP open tag was lost in extraction; confirm. */
START ""{NL}?
	/* IDENT: a letter, underscore or byte 0x7F-0xFF, followed by any number
	 * of those or digits. */
IDENT [a-zA-Z_\x7F-\xFF][a-zA-Z0-9_\x7F-\xFF]*
	/* Integer literals: decimal (no leading zero unless it is just "0"),
	 * hexadecimal with a 0x/0X prefix, and octal with a leading 0. */
DEC ([1-9][0-9]*)|0
HEX 0[xX][0-9a-fA-F]+
OCT 0[0-7]+
INT ({DEC}|{HEX}|{OCT})
	/* Real literals: digits with a decimal point on either side, optionally
	 * followed by an exponent; an exponent alone on an integer also counts. */
LNUM [0-9]+
DNUM ([0-9]*[\.]{LNUM})|({LNUM}[\.][0-9]*)
EXPONENT_DNUM (({LNUM}|{DNUM})[eE][+-]?{LNUM})
REAL {DNUM}|{EXPONENT_DNUM}
	/* Single-character operators; these are returned as their own
	 * character code by the SIMPLE_OP rule. */
BRACKET [(){}[\]]
ARITHMETIC [+\-/*%^]
BITWISE [&|~]
RELATIONAL [=><]
OTHER_OP [.!,?:$@]
SIMPLE_OP {BRACKET}|{ARITHMETIC}|{BITWISE}|{RELATIONAL}|{OTHER_OP}
	/* Casts: a type name in parentheses, with optional whitespace inside
	 * the parentheses (CS = cast start, CE = cast end). */
CS "("{WS}*
CE {WS}*")"
INT_CAST {CS}("integer"|"int"){CE}
REAL_CAST {CS}("float"|"real"|"double"){CE}
STRING_CAST {CS}"string"{CE}
ARRAY_CAST {CS}"array"{CE}
OBJECT_CAST {CS}"object"{CE}
BOOL_CAST {CS}("bool"|"boolean"){CE}
UNSET_CAST {CS}"unset"{CE}
	/* XML_NAME: like IDENT, but '.' and '-' are also allowed after the
	 * first character. */
XML_NAME [a-zA-Z_\x7F-\xFF][a-zA-Z0-9_\.\-\x7F-\xFF]*

%%

	/* Update source_line, source_column */

	/* Both rules apply in every start condition and end in REJECT, so they
	 * only do the bookkeeping and then let another rule handle the input.
	 * A newline outside the two comment states also clears
	 * attach_to_previous. */
<*>{NL} {
	if(YY_START != SL_COMM && YY_START != ML_COMM)
		attach_to_previous = 0;
	source_line++;
	source_column = 0;
	REJECT;
}
<*>. {
	if((*yytext != '\n') && (*yytext != '\r'))
		source_column++;
	REJECT;
}

	/* Casts */
	/* NOTE(review): these rules and the ones below carry no start-condition
	 * prefix in this copy; since every state above is exclusive, a prefix
	 * was presumably lost in extraction. Confirm against the original. */

{INT_CAST} { RETURN(CAST_INT); }
{REAL_CAST} { RETURN(CAST_REAL); }
{STRING_CAST} { RETURN(CAST_STRING); }
{ARRAY_CAST} { RETURN(CAST_ARRAY); }
{OBJECT_CAST} { RETURN(CAST_OBJECT); }
{BOOL_CAST} { RETURN(CAST_BOOL); }
{UNSET_CAST} { RETURN(CAST_UNSET); }

	/* Operators */
	/* Multi-character operators; "<>" is returned as the same token as
	 * "!=". */

"==" { RETURN(O_EQEQ); }
"===" { RETURN(O_EQEQEQ); }
"!=" { RETURN(O_NOTEQ); }
"<>" { RETURN(O_NOTEQ); }
"!==" { RETURN(O_NOTEQEQ); }
"<=" { RETURN(O_LE); }
">=" { RETURN(O_GE); }
"++" { RETURN(O_INC); }
"--" { RETURN(O_DEC); }
"=>" { RETURN(O_DOUBLEARROW); }
"->" { RETURN(O_SINGLEARROW); }
"<<" { RETURN(O_SL); }
">>" { RETURN(O_SR); }
"+=" { RETURN(O_PLUSEQ); }
"-=" { RETURN(O_MINUSEQ); }
"*=" { RETURN(O_MULEQ); }
"/=" { RETURN(O_DIVEQ); }
".=" { RETURN(O_CONCATEQ); }
"%=" { RETURN(O_MODEQ); }
"&=" { RETURN(O_ANDEQ); }
"|=" { RETURN(O_OREQ); }
"^=" { RETURN(O_XOREQ); }
"<<=" { RETURN(O_SLEQ); }
">>=" { RETURN(O_SREQ); }
"::" { RETURN(O_COLONCOLON); }
"&&" { RETURN(O_LOGICAND); }
"||" { RETURN(O_LOGICOR); }
	/* Any remaining single-character operator is its own token value. */
{SIMPLE_OP} { RETURN_OP(*yytext); }
";" { attach_to_previous = true; RETURN_OP(';'); }

	/*
	 * NOTE(review): the rules in this section carry no start-condition
	 * prefix in this copy of the file, although the section comments and
	 * the BEGIN() calls show that they belong to different states. The
	 * prefixes were presumably lost in extraction; the patterns are
	 * reproduced here exactly as found. Confirm against the original.
	 */

	/* Tokens */

${IDENT} {
	// variable names do not contain $
	semantic_value = new String(yytext+1);
	RETURN(VARIABLE);
}

{IDENT} %{
	{
		// Can't declare local variables without scoping them

		// We generate a semantic value which equals the
		// keyword so that we can reproduce it exactly the
		// same way in the unparsers, if we so desire
		// (keywords are case insensitive)
		semantic_value = new String(yytext);

		// Check if the ident is in fact a keyword. A keyword directly
		// after "->" is treated as a plain identifier.
		const struct keyword* keyword;
		keyword = in_word_set(yytext, yyleng);
		if(keyword != 0 && !after_arrow)
		{
			switch(keyword->token)
			{
				case K_CLASS:
				case K_FUNCTION:
					attach_to_previous = 1;
					break;
			}
			RETURN(keyword->token);
		}
		else
		{
			RETURN(IDENT);
		}
	}
%}

{XML_NAME} { semantic_value = new String(yytext); RETURN(XML_IDENT);}
{INT} { semantic_value = new String(yytext); RETURN(INT); }
{REAL} { semantic_value = new String(yytext); RETURN(REAL); }
	/* Leaving PHP code terminates the current statement. */
{STOP} { buffer = ""; BEGIN(INITIAL); RETURN(';'); }

	/* Strings */

"'" { buffer = ""; BEGIN(SQ_STR); }
"`" { buffer = ""; BEGIN(BT_STR); }
"\"" { buffer = ""; BEGIN(DQ_STR); }
"<<<"" "? { buffer = ""; BEGIN(HD_STR); }

	/* Comments */
	/* The comment text, delimiters included, is collected in buffer. A
	 * finished comment is attached to the previous node when
	 * attach_to_previous is set, and queued in last_comments otherwise. */

"/*" { buffer = yytext; BEGIN(ML_COMM); }
#|"//" { buffer = yytext; BEGIN(SL_COMM); }

"*/" {
	buffer.append(yytext);
	if(attach_to_previous)
		attach_comment(new String(buffer));
	else
		last_comments.push_back(new String(buffer));
	BEGIN(PHP);
	buffer = "";
}
{ANY} { buffer.push_back(*yytext); }

{NL} {
	if(attach_to_previous)
		attach_comment(new String(buffer));
	else
		last_comments.push_back(new String(buffer));
	attach_to_previous = 0;
	BEGIN(PHP);
	buffer = "";
}
{STOP} { buffer = ""; BEGIN(INITIAL); }
. { buffer.push_back(*yytext); }

	/* Any other character */

{WS} /* Ignore */
{ANY} { RETURN(INVALID_TOKEN); }

	/* Deal with singly quoted strings */
	/* A backslash switches to SQ_ESC: an escaped quote yields the quote
	 * alone, any other escaped character keeps its backslash. */

\' {
	semantic_value = new String(buffer);
	BEGIN(PHP);
	buffer = "";
	RETURN(STRING);
}
\\ { BEGIN(SQ_ESC); }
{ANY} { buffer.push_back(*yytext); }

\' { buffer.push_back(*yytext); BEGIN(SQ_STR); }
{ANY} {
	buffer.push_back('\\');
	buffer.push_back(*yytext);
	BEGIN(SQ_STR);
}

	/* Deal with backticked strings. */
	/* `cmd` is returned as the token sequence  shell_exec ( "cmd" )  */

\` {
	schedule_return(IDENT, "shell_exec");
	schedule_return('(');
	schedule_return(STRING, buffer);
	schedule_return(')');
	buffer = "";
	RETURN_ALL(PHP);
}
{ANY} { buffer.push_back(*yytext); }

	/* Deal with in-string syntax (in DQ_STR, and HD_STR) */
	/* Each form flushes the text collected so far as a STRING and returns
	 * the embedded expression surrounded by '.' tokens, then resumes in
	 * the state it was found in. */

"$"{IDENT} {
	schedule_return(STRING, buffer);
	schedule_return('.');
	schedule_return(VARIABLE, &yytext[1]);
	schedule_return('.');
	buffer = "";
	RETURN_ALL(YY_START);
}

"${"{IDENT}"}" {
	schedule_return(STRING, buffer);
	schedule_return('.');
	// skip "${" and drop the closing "}"
	schedule_return(VARIABLE, &yytext[2], yyleng - 3);
	schedule_return('.');
	buffer = "";
	RETURN_ALL(YY_START);
}

"$"{IDENT}"["{INT}"]" %{
	{
		// Offsets of the brackets within yytext
		long left, right;
		left = strchr(yytext, '[') - yytext;
		right = strchr(yytext, ']') - yytext;

		schedule_return(STRING, buffer);
		schedule_return('.');
		schedule_return(VARIABLE, &yytext[1], left - 1);
		schedule_return('[');
		schedule_return(INT, &yytext[left+1], right - left - 1);
		schedule_return(']');
		schedule_return('.');
		buffer = "";
		RETURN_ALL(YY_START);
	}
%}

"$"{IDENT}"["{IDENT}"]" %{
	{
		// Offsets of the brackets within yytext; the unquoted index is
		// returned as a STRING
		long left, right;
		left = strchr(yytext, '[') - yytext;
		right = strchr(yytext, ']') - yytext;

		schedule_return(STRING, buffer);
		schedule_return('.');
		schedule_return(VARIABLE, &yytext[1], left - 1);
		schedule_return('[');
		schedule_return(STRING, &yytext[left+1], right - left - 1);
		schedule_return(']');
		schedule_return('.');
		buffer = "";
		RETURN_ALL(YY_START);
	}
%}

"$"{IDENT}"->"{IDENT} %{
	{
		// Offset of the "->" within yytext
		long arrow;
		arrow = strchr(yytext, '-') - yytext;

		schedule_return(STRING, buffer);
		schedule_return('.');
		schedule_return(VARIABLE, &yytext[1], arrow - 1);
		schedule_return(O_SINGLEARROW);
		schedule_return(IDENT, &yytext[arrow+2]);
		schedule_return('.');
		buffer = "";
		RETURN_ALL(YY_START);
	}
%}

"{$" {
	return_state = YY_START;
	semantic_value = new String(buffer);
	// consume only the "{"; the "$" is scanned again as PHP code
	yyless(1);
	BEGIN(COMPLEX1);
	buffer = "";
	RETURN(STRING);
}

	/* Escape sequences; every rule returns to the state that saw the
	 * backslash. An unrecognised escape keeps its backslash. */

n { buffer.push_back('\n'); BEGIN(return_state); }
t { buffer.push_back('\t'); BEGIN(return_state); }
r { buffer.push_back('\r'); BEGIN(return_state); }
\\ { buffer.push_back('\\'); BEGIN(return_state); }
\$ { buffer.push_back('$'); BEGIN(return_state); }
x[0-9A-Fa-f]{1,2} %{
	{
		char c = (char) strtol(yytext + 1, 0, 16);
		buffer.push_back(c);
		BEGIN(return_state);
	}
%}
[0-7]{1,3} %{
	{
		char c = (char) strtol(yytext, 0, 8);
		buffer.push_back(c);
		BEGIN(return_state);
	}
%}
{ANY} {
	buffer.push_back('\\');
	buffer.push_back(*yytext);
	BEGIN(return_state);
}

	/* Complex syntax */
	/* Neither rule consumes input. The first hands the enclosing string
	 * state to the parser as the value of O_MAGIC_CONCAT and scans the
	 * expression as PHP code; the second resumes the string once
	 * return_to_complex_syntax() has been called. */

{ANY} {
	yyless(0);
	BEGIN(PHP);
	semantic_value = new Integer(return_state);
	RETURN_OP(O_MAGIC_CONCAT);
}

{ANY} {
	yyless(0);
	BEGIN(return_state);
	RETURN_OP('.');
}

	/* Deal with (doubly quoted) strings. */

\" {
	semantic_value = new String(buffer);
	BEGIN(PHP);
	buffer = "";
	RETURN(STRING);
}
\\\" { buffer.push_back('"'); }
\\ { return_state = YY_START; BEGIN(ESCAPE); }
{ANY} { buffer.push_back(*yytext); }

	/* Heredoc syntax */
	/* NOTE(review): heredoc_id is allocated with strdup() and not released
	 * anywhere in the visible code -- confirm whether it is freed
	 * elsewhere. */

{IDENT} {
	heredoc_id = strdup(yytext);
	heredoc_id_len = yyleng;
	heredoc_id_ptr = 0;
	BEGIN(HD_NL);
}
. { yyless(0); BEGIN(PHP); RETURN(INVALID_TOKEN); }

{NL} { BEGIN(HD_MAIN); }
{ANY} { RETURN(INVALID_TOKEN); }

\\ { return_state = YY_START; BEGIN(ESCAPE); }
{ANY} %{
	buffer.push_back(*yytext);

	// Track how much of the terminating identifier has been matched;
	// a match can only start in the first column of a line.
	if((source_column == 1) && (*yytext == heredoc_id[0]))
		heredoc_id_ptr = &heredoc_id[1];
	else if(heredoc_id_ptr && (*heredoc_id_ptr == *yytext))
		heredoc_id_ptr++;
	else
		heredoc_id_ptr = 0;

	// Only compare offsets while a match is in progress; subtracting
	// from a null pointer is not meaningful.
	if(heredoc_id_ptr && (heredoc_id_ptr - heredoc_id == heredoc_id_len))
	{
		BEGIN(HD_END);
	}
%}

{NL}|; %{
	{
		// Remove heredoc_id from the buffer
		long string_len = buffer.size() - heredoc_id_len;
		if(string_len < 0)
			string_len = 0;

		// The linebreak of the last line of the HEREDOC
		// string should also be stripped. Guard on string_len: for
		// an empty heredoc body it is 0 and there is no character
		// to inspect.
		if(string_len > 0 && buffer[string_len - 1] == '\n')
			string_len--;
		if(string_len > 0 && buffer[string_len - 1] == '\r')
			string_len--; // Windows file

		semantic_value = new String(buffer.substr(0, string_len));

		// Leave a terminating ';' in the input for the PHP rules
		if(yytext[0] == ';')
			yyless(0);

		BEGIN(PHP);
		buffer = "";
		RETURN(STRING);
	}
%}
. %{
	// The identifier was only a prefix of a longer word: keep scanning
	buffer.push_back(*yytext);
	heredoc_id_ptr = 0;
	BEGIN(HD_MAIN);
%}

	/* Returning multiple tokens */
	/* Emits the tokens queued by schedule_return() one per call without
	 * consuming input; after the last one the queue is reset and the
	 * lexer switches to the state given to RETURN_ALL. */

{ANY} {
	yyless(0);

	if(mt_index == mt_count - 1)
	{
		mt_count = 0;
		BEGIN(mt_final_state);
	}

	semantic_value = mt_lval[mt_index];
	mt_index++;
	RETURN(mt_type[mt_index - 1]);
}

	/* Deal with HTML fragments */

{START_ECHO} {
	// The logic that deals with returning multiple tokens
	// needs at least two tokens to work with.
	if(!buffer.empty())
		schedule_return(INLINE_HTML, buffer);
	else
		schedule_return(';');

	schedule_return(K_ECHO);
	RETURN_ALL(PHP);
}

{START} %{
	BEGIN(PHP);

	if(!buffer.empty())
	{
		semantic_value = new String(buffer);
		RETURN(INLINE_HTML);
	}
%}

	/* NOTE(review): "<>" is presumably a garbled end-of-file pattern; the
	 * action flushes any pending HTML and then terminates. Confirm. */
<> %{
	if(buffer.empty())
	{
		yyterminate();
	}
	else
	{
		semantic_value = new String(buffer);
		buffer = "";
		RETURN(INLINE_HTML);
	}
%}

{ANY} { buffer.push_back(*yytext); }

%%

/**
 * The member functions below are defined here rather than in PHP_lexer.h
 * because some of them need access to the BEGIN and COMPLEX2 macros defined
 * in lex.yy.cc
 */

/** Returns the semantic value of the token returned most recently. */
YYSTYPE PHP_lexer::get_yylval()
{
	return semantic_value;
}

/**
 * Creates a lexer reading from _is, positioned at line 1, column 0, with
 * an empty multi-token queue and no node to attach comments to.
 */
PHP_lexer::PHP_lexer(istream* _is)
{
	is = _is;
	yyin = _is;
	source_line = 1;
	attach_to_previous = false;
	source_column = 0;
	mt_index = 0;
	mt_count = 0;
	last_commented_node = 0;
	after_arrow = false;
}

/** Switches the lexer to COMPLEX2, which resumes the interrupted string. */
void PHP_lexer::return_to_complex_syntax()
{
	BEGIN(COMPLEX2);
}

/**
 * Appends comment s to the comments of last_commented_node.
 * last_commented_node must be set; this is checked with assert().
 */
void PHP_lexer::attach_comment(String *s)
{
	assert(last_commented_node);
	last_commented_node->get_comments()->push_back(s);
}

/**
 * Queues a token for the multi-token return mechanism (see RETURN_ALL).
 *
 * type    the token type
 * lval    text for the semantic value, or NULL for a token without one
 * length  number of characters of lval to use, or -1 for all of it
 *
 * Callers omit lval and length, so both presumably have defaults declared
 * in PHP_lexer.h -- verify there.
 * NOTE(review): mt_count is not checked against the capacity of mt_type
 * and mt_lval, which are declared outside this file.
 */
void PHP_lexer::schedule_return(long type, const char* lval, long length)
{
	mt_type[mt_count] = type;

	if(lval)
	{
		if(length == -1)
			mt_lval[mt_count] = new String(lval);
		else
			mt_lval[mt_count] = new String(lval, length);
	}
	else
	{
		mt_lval[mt_count] = NULL;
	}

	mt_count++;
}

/** Queues a token whose semantic value is a copy of the whole of s. */
void PHP_lexer::schedule_return(long type, string& s)
{
	schedule_return(type, s.c_str(), s.size());
}