diff --git a/lib/puppet/pops/parser/slurp_support.rb b/lib/puppet/pops/parser/slurp_support.rb index 61d47db2b..fb2af8304 100644 --- a/lib/puppet/pops/parser/slurp_support.rb +++ b/lib/puppet/pops/parser/slurp_support.rb @@ -1,95 +1,95 @@ # This module is an integral part of the Lexer. # It defines the string slurping behavior - finding the string and non string parts in interpolated # strings, translating escape sequences in strings to their single character equivalence. # # PERFORMANCE NOTE: The various kinds of slurping could be made even more generic, but requires # additional parameter passing and evaluation of conditional logic. # TODO: More detailed performance analysis of excessive character escaping and interpolation. # module Puppet::Pops::Parser::SlurpSupport SLURP_SQ_PATTERN = /(?:[^\\]|^|[^\\])(?:[\\]{2})*[']/ SLURP_DQ_PATTERN = /(?:[^\\]|^|[^\\])(?:[\\]{2})*(["]|[$]\{?)/ SLURP_UQ_PATTERN = /(?:[^\\]|^|[^\\])(?:[\\]{2})*([$]\{?|\z)/ SLURP_ALL_PATTERN = /.*(\z)/ - SQ_ESCAPES = %w{ ' } + SQ_ESCAPES = %w{ \\ ' } DQ_ESCAPES = %w{ \\ $ ' " r n t s u}+["\r\n", "\n"] UQ_ESCAPES = %w{ \\ $ r n t s u}+["\r\n", "\n"] def slurp_sqstring # skip the leading ' @scanner.pos += 1 str = slurp(@scanner, SLURP_SQ_PATTERN, SQ_ESCAPES, :ignore_invalid_escapes) || lex_error("Unclosed quote after \"'\" followed by '#{followed_by}'") str[0..-2] # strip closing "'" from result end def slurp_dqstring scn = @scanner last = scn.matched str = slurp(scn, SLURP_DQ_PATTERN, DQ_ESCAPES, false) unless str lex_error("Unclosed quote after #{format_quote(last)} followed by '#{followed_by}'") end # Terminator may be a single char '"', '$', or two characters '${' group match 1 (scn[1]) from the last slurp holds this terminator = scn[1] [str[0..(-1 - terminator.length)], terminator] end # Copy from old lexer - can do much better def slurp_uqstring scn = @scanner last = scn.matched ignore = true str = slurp(scn, @lexing_context[:uq_slurp_pattern], @lexing_context[:escapes], 
:ignore_invalid_escapes) # Terminator may be a single char '$', two characters '${', or empty string '' at the end of input. # Group match 1 holds this. # The exceptional case is found by looking at the subgroup 1 of the most recent match made by the scanner (i.e. @scanner[1]). # This is the last match made by the slurp method (having called scan_until on the scanner). # If there is a terminating character it must be stripped and returned separately. # terminator = scn[1] [str[0..(-1 - terminator.length)], terminator] end # Slurps a string from the given scanner until the given pattern and then replaces any escaped # characters given by escapes into their control-character equivalent or in case of line breaks, replaces the # pattern \r?\n with an empty string. # The returned string contains the terminating character. Returns nil if the scanner can not scan until the given # pattern. # def slurp(scanner, pattern, escapes, ignore_invalid_escapes) str = scanner.scan_until(pattern) || return # Process unicode escapes first as they require getting 4 hex digits # If later a \u is found it is warned not to be a unicode escape if escapes.include?('u') str.gsub!(/\\u([\da-fA-F]{4})/m) { [$1.hex].pack("U") } end str.gsub!(/\\([^\r\n]|(?:\r?\n))/m) { ch = $1 if escapes.include? 
ch case ch when 'r' ; "\r" when 'n' ; "\n" when 't' ; "\t" when 's' ; " " when 'u' Puppet.warning(positioned_message("Unicode escape '\\u' was not followed by 4 hex digits")) "\\u" when "\n" ; '' when "\r\n"; '' else ch end else Puppet.warning(positioned_message("Unrecognized escape sequence '\\#{ch}'")) unless ignore_invalid_escapes "\\#{ch}" end } str end end diff --git a/spec/unit/pops/parser/lexer2_spec.rb b/spec/unit/pops/parser/lexer2_spec.rb index 655b6cf54..ce1f0ab8a 100644 --- a/spec/unit/pops/parser/lexer2_spec.rb +++ b/spec/unit/pops/parser/lexer2_spec.rb @@ -1,400 +1,407 @@ require 'spec_helper' require 'matchers/match_tokens2' require 'puppet/pops' require 'puppet/pops/parser/lexer2' module EgrammarLexer2Spec def tokens_scanned_from(s) lexer = Puppet::Pops::Parser::Lexer2.new lexer.string = s tokens = lexer.fullscan[0..-2] end def epp_tokens_scanned_from(s) lexer = Puppet::Pops::Parser::Lexer2.new lexer.string = s tokens = lexer.fullscan_epp[0..-2] end end describe 'Lexer2' do include EgrammarLexer2Spec { :LBRACK => '[', :RBRACK => ']', :LBRACE => '{', :RBRACE => '}', :LPAREN => '(', :RPAREN => ')', :EQUALS => '=', :ISEQUAL => '==', :GREATEREQUAL => '>=', :GREATERTHAN => '>', :LESSTHAN => '<', :LESSEQUAL => '<=', :NOTEQUAL => '!=', :NOT => '!', :COMMA => ',', :DOT => '.', :COLON => ':', :AT => '@', :LLCOLLECT => '<<|', :RRCOLLECT => '|>>', :LCOLLECT => '<|', :RCOLLECT => '|>', :SEMIC => ';', :QMARK => '?', :OTHER => '\\', :FARROW => '=>', :PARROW => '+>', :APPENDS => '+=', :DELETES => '-=', :PLUS => '+', :MINUS => '-', :DIV => '/', :TIMES => '*', :LSHIFT => '<<', :RSHIFT => '>>', :MATCH => '=~', :NOMATCH => '!~', :IN_EDGE => '->', :OUT_EDGE => '<-', :IN_EDGE_SUB => '~>', :OUT_EDGE_SUB => '<~', :PIPE => '|', }.each do |name, string| it "should lex a token named #{name.to_s}" do tokens_scanned_from(string).should match_tokens2(name) end end { "case" => :CASE, "class" => :CLASS, "default" => :DEFAULT, "define" => :DEFINE, # "import" => :IMPORT, # done as 
a function in egrammar "if" => :IF, "elsif" => :ELSIF, "else" => :ELSE, "inherits" => :INHERITS, "node" => :NODE, "and" => :AND, "or" => :OR, "undef" => :UNDEF, "false" => :BOOLEAN, "true" => :BOOLEAN, "in" => :IN, "unless" => :UNLESS, }.each do |string, name| it "should lex a keyword from '#{string}'" do tokens_scanned_from(string).should match_tokens2(name) end end # TODO: Complete with all edge cases [ 'A', 'A::B', '::A', '::A::B',].each do |string| it "should lex a CLASSREF on the form '#{string}'" do tokens_scanned_from(string).should match_tokens2([:CLASSREF, string]) end end # TODO: Complete with all edge cases [ 'a', 'a::b', '::a', '::a::b',].each do |string| it "should lex a NAME on the form '#{string}'" do tokens_scanned_from(string).should match_tokens2([:NAME, string]) end end [ 'a-b', 'a--b', 'a-b-c'].each do |string| it "should lex a BARE WORD STRING on the form '#{string}'" do tokens_scanned_from(string).should match_tokens2([:STRING, string]) end end { '-a' => [:MINUS, :NAME], '--a' => [:MINUS, :MINUS, :NAME], 'a-' => [:NAME, :MINUS], 'a- b' => [:NAME, :MINUS, :NAME], 'a--' => [:NAME, :MINUS, :MINUS], 'a-$3' => [:NAME, :MINUS, :VARIABLE], }.each do |source, expected| it "should lex leading and trailing hyphens from #{source}" do tokens_scanned_from(source).should match_tokens2(*expected) end end { 'false'=>false, 'true'=>true}.each do |string, value| it "should lex a BOOLEAN on the form '#{string}'" do tokens_scanned_from(string).should match_tokens2([:BOOLEAN, value]) end end [ '0', '1', '2982383139'].each do |string| it "should lex a decimal integer NUMBER on the form '#{string}'" do tokens_scanned_from(string).should match_tokens2([:NUMBER, string]) end end { ' 1' => '1', '1 ' => '1', ' 1 ' => '1'}.each do |string, value| it "should lex a NUMBER with surrounding space '#{string}'" do tokens_scanned_from(string).should match_tokens2([:NUMBER, value]) end end [ '0.0', '0.1', '0.2982383139', '29823.235', '10e23', '10e-23', '1.234e23'].each do 
|string| it "should lex a decimal floating point NUMBER on the form '#{string}'" do tokens_scanned_from(string).should match_tokens2([:NUMBER, string]) end end [ '00', '01', '0123', '0777'].each do |string| it "should lex an octal integer NUMBER on the form '#{string}'" do tokens_scanned_from(string).should match_tokens2([:NUMBER, string]) end end [ '0x0', '0x1', '0xa', '0xA', '0xabcdef', '0xABCDEF'].each do |string| it "should lex an hex integer NUMBER on the form '#{string}'" do tokens_scanned_from(string).should match_tokens2([:NUMBER, string]) end end { "''" => '', "'a'" => 'a', "'a\\'b'" =>"a'b", - "'a\\r\\n\\t\\s\\$\\\"\\\\b'" => "a\\r\\n\\t\\s\\$\\\"\\\\b" + "'a\\rb'" =>"a\\rb", + "'a\\nb'" =>"a\\nb", + "'a\\tb'" =>"a\\tb", + "'a\\sb'" =>"a\\sb", + "'a\\$b'" =>"a\\$b", + "'a\\\"b'" =>"a\\\"b", + "'a\\\\b'" =>"a\\b", + "'a\\\\'" =>"a\\", }.each do |source, expected| it "should lex a single quoted STRING on the form #{source}" do tokens_scanned_from(source).should match_tokens2([:STRING, expected]) end end { '""' => '', '"a"' => 'a', '"a\'b"' => "a'b", }.each do |source, expected| it "should lex a double quoted STRING on the form #{source}" do tokens_scanned_from(source).should match_tokens2([:STRING, expected]) end end { '"a$x b"' => [[:DQPRE, 'a', {:line => 1, :pos=>1, :length=>2 }], [:VARIABLE, 'x', {:line => 1, :pos=>3, :length=>2 }], [:DQPOST, ' b', {:line => 1, :pos=>5, :length=>3 }]], '"a$x.b"' => [[:DQPRE, 'a', {:line => 1, :pos=>1, :length=>2 }], [:VARIABLE, 'x', {:line => 1, :pos=>3, :length=>2 }], [:DQPOST, '.b', {:line => 1, :pos=>5, :length=>3 }]], '"$x.b"' => [[:DQPRE, '', {:line => 1, :pos=>1, :length=>1 }], [:VARIABLE, 'x', {:line => 1, :pos=>2, :length=>2 }], [:DQPOST, '.b', {:line => 1, :pos=>4, :length=>3 }]], '"a$x"' => [[:DQPRE, 'a', {:line => 1, :pos=>1, :length=>2 }], [:VARIABLE, 'x', {:line => 1, :pos=>3, :length=>2 }], [:DQPOST, '', {:line => 1, :pos=>5, :length=>1 }]], }.each do |source, expected| it "should lex an interpolated 
variable 'x' from #{source}" do tokens_scanned_from(source).should match_tokens2(*expected) end end it "differentiates between foo[x] and foo [x] (whitespace)" do tokens_scanned_from("$a[1]").should match_tokens2(:VARIABLE, :LBRACK, :NUMBER, :RBRACK) tokens_scanned_from("$a [1]").should match_tokens2(:VARIABLE, :LBRACK, :NUMBER, :RBRACK) tokens_scanned_from("a[1]").should match_tokens2(:NAME, :LBRACK, :NUMBER, :RBRACK) tokens_scanned_from("a [1]").should match_tokens2(:NAME, :LISTSTART, :NUMBER, :RBRACK) tokens_scanned_from(" if \n\r\t\nif if ").should match_tokens2(:IF, :IF, :IF) end it "skips whitepsace" do tokens_scanned_from(" if if if ").should match_tokens2(:IF, :IF, :IF) tokens_scanned_from(" if \n\r\t\nif if ").should match_tokens2(:IF, :IF, :IF) end it "skips single line comments" do tokens_scanned_from("if # comment\nif").should match_tokens2(:IF, :IF) end ["if /* comment */\nif", "if /* comment\n */\nif", "if /*\n comment\n */\nif", ].each do |source| it "skips multi line comments" do tokens_scanned_from(source).should match_tokens2(:IF, :IF) end end { "=~" => [:MATCH, "=~ /./"], "!~" => [:NOMATCH, "!~ /./"], "," => [:COMMA, ", /./"], "(" => [:LPAREN, "( /./"], "[" => [:LBRACK, "[ /./"], "{" => [:LBRACE, "{ /./"], "+" => [:PLUS, "+ /./"], "-" => [:MINUS, "- /./"], "*" => [:TIMES, "* /./"], ";" => [:SEMIC, "; /./"], }.each do |token, entry| it "should lex regexp after '#{token}'" do tokens_scanned_from(entry[1]).should match_tokens2(entry[0], :REGEX) end end it "should lex a simple expression" do tokens_scanned_from('1 + 1').should match_tokens2([:NUMBER, '1'], :PLUS, [:NUMBER, '1']) end { "1" => ["1 /./", [:NUMBER, :DIV, :DOT, :DIV]], "'a'" => ["'a' /./", [:STRING, :DIV, :DOT, :DIV]], "true" => ["true /./", [:BOOLEAN, :DIV, :DOT, :DIV]], "false" => ["false /./", [:BOOLEAN, :DIV, :DOT, :DIV]], "/./" => ["/./ /./", [:REGEX, :DIV, :DOT, :DIV]], "a" => ["a /./", [:NAME, :DIV, :DOT, :DIV]], "A" => ["A /./", [:CLASSREF, :DIV, :DOT, :DIV]], ")" => [") /./", 
[:RPAREN, :DIV, :DOT, :DIV]], "]" => ["] /./", [:RBRACK, :DIV, :DOT, :DIV]], "|>" => ["|> /./", [:RCOLLECT, :DIV, :DOT, :DIV]], "|>>" => ["|>> /./", [:RRCOLLECT, :DIV, :DOT, :DIV]], '"a$a"' => ['"a$a" /./', [:DQPRE, :VARIABLE, :DQPOST, :DIV, :DOT, :DIV]], }.each do |token, entry| it "should not lex regexp after '#{token}'" do tokens_scanned_from(entry[ 0 ]).should match_tokens2(*entry[ 1 ]) end end it 'should lex assignment' do tokens_scanned_from("$a = 10").should match_tokens2([:VARIABLE, "a"], :EQUALS, [:NUMBER, '10']) end # TODO: Tricky, and heredoc not supported yet # it "should not lex regexp after heredoc" do # tokens_scanned_from("1 / /./").should match_tokens2(:NUMBER, :DIV, :REGEX) # end it "should lex regexp at beginning of input" do tokens_scanned_from(" /./").should match_tokens2(:REGEX) end it "should lex regexp right of div" do tokens_scanned_from("1 / /./").should match_tokens2(:NUMBER, :DIV, :REGEX) end context 'when lexer lexes heredoc' do it 'lexes tag, syntax and escapes, margin and right trim' do code = <<-CODE @(END:syntax/t) Tex\\tt\\n |- END CODE tokens_scanned_from(code).should match_tokens2([:HEREDOC, 'syntax'], [:STRING, "Tex\tt\\n"]) end it 'lexes "tag", syntax and escapes, margin, right trim and interpolation' do code = <<-CODE @("END":syntax/t) Tex\\tt\\n$var After |- END CODE tokens_scanned_from(code).should match_tokens2( [:HEREDOC, 'syntax'], [:DQPRE, "Tex\tt\\n"], [:VARIABLE, "var"], [:DQPOST, " After"] ) end end it 'should support unicode characters' do code = <<-CODE "x\\u2713y" CODE if Puppet::Pops::Parser::Locator::RUBYVER < Puppet::Pops::Parser::Locator::RUBY_1_9_3 # Ruby 1.8.7 reports the multibyte char as several octal characters tokens_scanned_from(code).should match_tokens2([:STRING, "x\342\234\223y"]) else # >= Ruby 1.9.3 reports \u tokens_scanned_from(code).should match_tokens2([:STRING, "x\u2713y"]) end end context 'when lexing epp' do it 'epp can contain just text' do code = <<-CODE This is just text CODE 
epp_tokens_scanned_from(code).should match_tokens2([:RENDER_STRING, " This is just text\n"]) end it 'epp can contain text with interpolated rendered expressions' do code = <<-CODE This is <%= $x %> just text CODE epp_tokens_scanned_from(code).should match_tokens2( [:RENDER_STRING, " This is "], [:RENDER_EXPR, nil], [:VARIABLE, "x"], [:RENDER_STRING, " just text\n"] ) end it 'epp can contain text with expressions that are not rendered' do code = <<-CODE This is <% $x=10 %> just text CODE epp_tokens_scanned_from(code).should match_tokens2( [:RENDER_STRING, " This is "], [:VARIABLE, "x"], :EQUALS, [:NUMBER, "10"], [:RENDER_STRING, " just text\n"] ) end it 'epp can skip leading space in tail text' do code = <<-CODE This is <% $x=10 -%> just text CODE epp_tokens_scanned_from(code).should match_tokens2( [:RENDER_STRING, " This is "], [:VARIABLE, "x"], :EQUALS, [:NUMBER, "10"], [:RENDER_STRING, "just text\n"] ) end it 'epp can skip comments' do code = <<-CODE This is <% $x=10 -%> <%# This is an epp comment -%> just text CODE epp_tokens_scanned_from(code).should match_tokens2( [:RENDER_STRING, " This is "], [:VARIABLE, "x"], :EQUALS, [:NUMBER, "10"], [:RENDER_STRING, "just text\n"] ) end it 'epp can escape epp tags' do code = <<-CODE This is <% $x=10 -%> <%% this is escaped epp %%> CODE epp_tokens_scanned_from(code).should match_tokens2( [:RENDER_STRING, " This is "], [:VARIABLE, "x"], :EQUALS, [:NUMBER, "10"], [:RENDER_STRING, "<% this is escaped epp %>\n"] ) end end end