diff --git a/lib/puppet/pops/parser/lexer2.rb b/lib/puppet/pops/parser/lexer2.rb
index e65e56d24..6f352931a 100644
--- a/lib/puppet/pops/parser/lexer2.rb
+++ b/lib/puppet/pops/parser/lexer2.rb
@@ -1,694 +1,694 @@
 # The Lexer is responsible for turning source text into tokens.
 # This version is a performance enhanced lexer (in comparison to the 3.x and earlier "future parser" lexer).
 #
 # Old returns tokens [:KEY, value, { locator = }
 # Could return [[token], locator]
 # or Token.new([token], locator) with the same API
 # x[0] = token_symbol, x[1] = self, x[:key] = (:value, :file, :line, :pos) etc
 require 'strscan'
 require 'puppet/pops/parser/lexer_support'
 require 'puppet/pops/parser/heredoc_support'
 require 'puppet/pops/parser/interpolation_support'
 require 'puppet/pops/parser/epp_support'
 require 'puppet/pops/parser/slurp_support'

 class Puppet::Pops::Parser::Lexer2
   include Puppet::Pops::Parser::LexerSupport
   include Puppet::Pops::Parser::HeredocSupport
   include Puppet::Pops::Parser::InterpolationSupport
   include Puppet::Pops::Parser::SlurpSupport
   include Puppet::Pops::Parser::EppSupport

   # All tokens have three slots: the token name (a Symbol), the token text (String), and a token text length.
   # All operator and punctuation tokens reuse singleton arrays. Tokens that require unique values create
   # a unique array per token.
   #
   # PERFORMANCE NOTES:
   # This construct reduces the number of objects that need to be created for operators and punctuation.
   # The length is pre-calculated for all singleton tokens. The length is used both to signal the length of
   # the token, and to advance the scanner position (without having to advance it with a scan(regexp)).
   #
   TOKEN_LBRACK       = [:LBRACK,       '['.freeze,   1].freeze
   TOKEN_LISTSTART    = [:LISTSTART,    '['.freeze,   1].freeze
   TOKEN_RBRACK       = [:RBRACK,       ']'.freeze,   1].freeze
   TOKEN_LBRACE       = [:LBRACE,       '{'.freeze,   1].freeze
   TOKEN_RBRACE       = [:RBRACE,       '}'.freeze,   1].freeze
   TOKEN_SELBRACE     = [:SELBRACE,     '{'.freeze,   1].freeze
   TOKEN_LPAREN       = [:LPAREN,       '('.freeze,   1].freeze
   TOKEN_RPAREN       = [:RPAREN,       ')'.freeze,   1].freeze

   TOKEN_EQUALS       = [:EQUALS,       '='.freeze,   1].freeze
   TOKEN_APPENDS      = [:APPENDS,      '+='.freeze,  2].freeze
   TOKEN_DELETES      = [:DELETES,      '-='.freeze,  2].freeze

   TOKEN_ISEQUAL      = [:ISEQUAL,      '=='.freeze,  2].freeze
   TOKEN_NOTEQUAL     = [:NOTEQUAL,     '!='.freeze,  2].freeze
   TOKEN_MATCH        = [:MATCH,        '=~'.freeze,  2].freeze
   TOKEN_NOMATCH      = [:NOMATCH,      '!~'.freeze,  2].freeze
   TOKEN_GREATEREQUAL = [:GREATEREQUAL, '>='.freeze,  2].freeze
   TOKEN_GREATERTHAN  = [:GREATERTHAN,  '>'.freeze,   1].freeze
   TOKEN_LESSEQUAL    = [:LESSEQUAL,    '<='.freeze,  2].freeze
   TOKEN_LESSTHAN     = [:LESSTHAN,     '<'.freeze,   1].freeze

   TOKEN_FARROW       = [:FARROW,       '=>'.freeze,  2].freeze
   TOKEN_PARROW       = [:PARROW,       '+>'.freeze,  2].freeze

   TOKEN_LSHIFT       = [:LSHIFT,       '<<'.freeze,  2].freeze
   TOKEN_LLCOLLECT    = [:LLCOLLECT,    '<<|'.freeze, 3].freeze
   TOKEN_LCOLLECT     = [:LCOLLECT,     '<|'.freeze,  2].freeze

   TOKEN_RSHIFT       = [:RSHIFT,       '>>'.freeze,  2].freeze
   TOKEN_RRCOLLECT    = [:RRCOLLECT,    '|>>'.freeze, 3].freeze
   TOKEN_RCOLLECT     = [:RCOLLECT,     '|>'.freeze,  2].freeze

   TOKEN_PLUS         = [:PLUS,         '+'.freeze,   1].freeze
   TOKEN_MINUS        = [:MINUS,        '-'.freeze,   1].freeze
   TOKEN_DIV          = [:DIV,          '/'.freeze,   1].freeze
   TOKEN_TIMES        = [:TIMES,        '*'.freeze,   1].freeze
   TOKEN_MODULO       = [:MODULO,       '%'.freeze,   1].freeze

   TOKEN_NOT          = [:NOT,          '!'.freeze,   1].freeze
   TOKEN_DOT          = [:DOT,          '.'.freeze,   1].freeze
   TOKEN_PIPE         = [:PIPE,         '|'.freeze,   1].freeze
   TOKEN_AT           = [:AT,           '@'.freeze,   1].freeze
   TOKEN_ATAT         = [:ATAT,         '@@'.freeze,  2].freeze
   TOKEN_COLON        = [:COLON,        ':'.freeze,   1].freeze
   TOKEN_COMMA        = [:COMMA,        ','.freeze,   1].freeze
   TOKEN_SEMIC        = [:SEMIC,        ';'.freeze,   1].freeze
   TOKEN_QMARK        = [:QMARK,        '?'.freeze,   1].freeze
   TOKEN_TILDE        = [:TILDE,        '~'.freeze,   1].freeze # lexed but not an operator in Puppet

   TOKEN_REGEXP       = [:REGEXP,       nil,          0].freeze

   TOKEN_IN_EDGE      = [:IN_EDGE,      '->'.freeze,  2].freeze
   TOKEN_IN_EDGE_SUB  = [:IN_EDGE_SUB,  '~>'.freeze,  2].freeze
   TOKEN_OUT_EDGE     = [:OUT_EDGE,     '<-'.freeze,  2].freeze
   TOKEN_OUT_EDGE_SUB = [:OUT_EDGE_SUB, '<~'.freeze,  2].freeze
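   # Illustration (not part of the original file; a minimal sketch): each singleton
   # token above is just a frozen three-slot Array, so a hypothetical caller can
   # destructure one directly:
   #
   #   name, text, length = Puppet::Pops::Parser::Lexer2::TOKEN_FARROW
   #   name   # => :FARROW
   #   text   # => "=>"
   #   length # => 2  (used to advance the scanner without a scan(regexp))
   #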
   # Tokens that are always unique to what has been lexed
   TOKEN_STRING         = [:STRING,   nil,        0].freeze
   TOKEN_WORD           = [:WORD,     nil,        0].freeze
   TOKEN_DQPRE          = [:DQPRE,    nil,        0].freeze
   TOKEN_DQMID          = [:DQMID,    nil,        0].freeze
   TOKEN_DQPOS          = [:DQPOST,   nil,        0].freeze
   TOKEN_NUMBER         = [:NUMBER,   nil,        0].freeze
   TOKEN_VARIABLE       = [:VARIABLE, nil,        1].freeze
   TOKEN_VARIABLE_EMPTY = [:VARIABLE, ''.freeze,  1].freeze

   # HEREDOC has syntax as an argument.
   TOKEN_HEREDOC = [:HEREDOC, nil, 0].freeze

   # EPP_START is currently a marker token, may later get syntax
   TOKEN_EPPSTART    = [:EPP_START,    nil,   0].freeze
   TOKEN_EPPEND      = [:EPP_END,      '%>',  2].freeze
   TOKEN_EPPEND_TRIM = [:EPP_END_TRIM, '-%>', 3].freeze

   # This is used for unrecognized tokens, will always be a single character. This particular instance
   # is not used, but is kept here for documentation purposes.
   TOKEN_OTHER = [:OTHER, nil, 0]

   # Keywords are all singleton tokens with pre-calculated lengths.
   # Booleans are pre-calculated (rather than evaluating the strings "false" and "true" repeatedly).
   #
   KEYWORDS = {
     "case"     => [:CASE,     'case',     4],
     "class"    => [:CLASS,    'class',    5],
     "default"  => [:DEFAULT,  'default',  7],
     "define"   => [:DEFINE,   'define',   6],
     "if"       => [:IF,       'if',       2],
     "elsif"    => [:ELSIF,    'elsif',    5],
     "else"     => [:ELSE,     'else',     4],
     "inherits" => [:INHERITS, 'inherits', 8],
     "node"     => [:NODE,     'node',     4],
     "and"      => [:AND,      'and',      3],
     "or"       => [:OR,       'or',       2],
     "undef"    => [:UNDEF,    'undef',    5],
     "false"    => [:BOOLEAN,  false,      5],
     "true"     => [:BOOLEAN,  true,       4],
     "in"       => [:IN,       'in',       2],
     "unless"   => [:UNLESS,   'unless',   6],
     "function" => [:FUNCTION, 'function', 8],
     "type"     => [:TYPE,     'type',     4],
     "attr"     => [:ATTR,     'attr',     4],
     "private"  => [:PRIVATE,  'private',  7],
   }
   KEYWORDS.each { |k, v| v[1].freeze; v.freeze }
   KEYWORDS.freeze

   # Reverse lookup of keyword name to string
   KEYWORD_NAMES = {}
   KEYWORDS.each { |k, v| KEYWORD_NAMES[v[0]] = k }
   KEYWORD_NAMES.freeze

   PATTERN_WS = %r{[[:blank:]\r]+}

   # The single line comment includes the line ending.
   PATTERN_COMMENT   = %r{#.*\r?}
   PATTERN_MLCOMMENT = %r{/\*(.*?)\*/}m

   PATTERN_REGEX     = %r{/[^/\n]*/}
   PATTERN_REGEX_END = %r{/}
   PATTERN_REGEX_A   = %r{\A/} # for replacement to ""
   PATTERN_REGEX_Z   = %r{/\Z} # for replacement to ""
   PATTERN_REGEX_ESC = %r{\\/} # for replacement to "/"

   # The 3x patterns:
   # PATTERN_CLASSREF = %r{((::){0,1}[A-Z][-\w]*)+}
   # PATTERN_NAME     = %r{((::)?[a-z0-9][-\w]*)(::[a-z0-9][-\w]*)*}

   # The NAME and CLASSREF in 4x are strict. Each segment must start with
   # a letter a-z and may not contain dashes (\w includes letters, digits and _).
   #
   PATTERN_CLASSREF = %r{((::){0,1}[A-Z][\w]*)+}
   PATTERN_NAME     = %r{^((::)?[a-z][\w]*)(::[a-z][\w]*)*$}

   PATTERN_BARE_WORD = %r{((?:::){0,1}(?:[a-z_](?:[\w-]*[\w])?))+}

   PATTERN_DOLLAR_VAR = %r{\$(::)?(\w+::)*\w+}
   PATTERN_NUMBER     = %r{\b(?:0[xX][0-9A-Fa-f]+|0?\d+(?:\.\d+)?(?:[eE]-?\d+)?)\b}
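   # Illustration (not part of the original file): how the strict 4x patterns
   # above classify bare words; each segment must start with a lower-case letter:
   #
   #   PATTERN_NAME =~ 'apache::mod::ssl'  # => 0   (a NAME)
   #   PATTERN_NAME =~ 'Apache::Mod'       # => nil (upper case: CLASSREF territory)
   #   PATTERN_CLASSREF =~ 'Apache::Mod'   # => 0
   #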
   # PERFORMANCE NOTE:
   # Comparison against a frozen string is faster (than unfrozen).
   #
   STRING_BSLASH_BSLASH = '\\'.freeze

   attr_reader :locator

   def initialize()
   end

   # Clears the lexer state. It is not required to call this, as the state will be garbage collected
   # and the next lex call (lex_string, lex_file) resets the internal state.
   #
   def clear()
     # not really needed, but if someone wants to ensure garbage is collected as early as possible
     @scanner = nil
     @locator = nil
     @lexing_context = nil
   end

   # Convenience method, and for compatibility with the older lexer. Use lex_string instead, which allows
   # passing the path to use without first having to call file= (which reads the file if it exists).
   # (Bad form to use overloading of the assignment operator for something that is not really an assignment. Also,
   # overloading of = does not allow passing more than one argument.)
   #
   def string=(string)
     lex_string(string, '')
   end

   def lex_string(string, path='')
     initvars
     @scanner = StringScanner.new(string)
     @locator = Puppet::Pops::Parser::Locator.locator(string, path)
   end

   # Lexes an unquoted string.
   # @param string [String] the string to lex
   # @param locator [Puppet::Pops::Parser::Locator] the locator to use (a default is used if nil is given)
   # @param escapes [Array] array of character strings representing the escape sequences to transform
   # @param interpolate [Boolean] whether interpolation of expressions should be made or not.
   #
   def lex_unquoted_string(string, locator, escapes, interpolate)
     initvars
     @scanner = StringScanner.new(string)
     @locator = locator || Puppet::Pops::Parser::Locator.locator(string, '')
     @lexing_context[:escapes] = escapes || UQ_ESCAPES
-    @lexing_context[:uq_slurp_pattern] = interpolate ? (!escapes.include?('$') ? SLURP_UQNE_PATTERN : SLURP_UQ_PATTERN) : SLURP_ALL_PATTERN
+    @lexing_context[:uq_slurp_pattern] = interpolate ? (escapes.include?('$') ? SLURP_UQ_PATTERN : SLURP_UQNE_PATTERN) : SLURP_ALL_PATTERN
   end

   # Convenience method, and for compatibility with the older lexer. Use lex_file instead.
   # (Bad form to use overloading of the assignment operator for something that is not really an assignment.)
   #
   def file=(file)
     lex_file(file)
   end

   # TODO: This method should not be used, callers should get the locator since it is most likely required to
   # compute line, position etc given offsets.
   #
   def file
     @locator ? @locator.file : nil
   end

   # Initializes lexing of the content of the given file. An empty string is used if the file does not exist.
   #
   def lex_file(file)
     initvars
     contents = Puppet::FileSystem.exist?(file) ? Puppet::FileSystem.read(file) : ""
     @scanner = StringScanner.new(contents.freeze)
     @locator = Puppet::Pops::Parser::Locator.locator(contents, file)
   end

   def initvars
     @token_queue = []
     # NOTE: additional keys are used; :escapes, :uq_slurp_pattern, :newline_jump, :epp_*
     @lexing_context = {
       :brace_count => 0,
       :after => nil,
     }
   end

   # Scans all of the content and returns it in an array.
   # Note that the terminating [false, false] token is included in the result.
   #
   def fullscan
     result = []
     scan { |token, value| result.push([token, value]) }
     result
   end
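   # Illustration (not part of the original file; assumes Puppet's code is on the
   # load path): driving the lexer over a small source string with fullscan:
   #
   #   require 'puppet'
   #   require 'puppet/pops'
   #
   #   lexer = Puppet::Pops::Parser::Lexer2.new
   #   lexer.lex_string('$x = 42', 'demo.pp')
   #   lexer.fullscan.map(&:first)
   #   # => [:VARIABLE, :EQUALS, :NUMBER, false]  (the trailing false marks end of input)
   #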
   # A block must be passed to scan. It will be called with two arguments, a symbol for the token,
   # and an instance of LexerSupport::TokenValue.
   # PERFORMANCE NOTE: The TokenValue is designed to reduce the amount of garbage / temporary data,
   # and to only convert the lexer's internal tokens on demand. It is slightly more costly to create an
   # instance of a class defined in Ruby than an Array or Hash, but the gain is much bigger since transformation
   # logic is avoided for many of its members (most are never used; e.g. line/pos information is in general only
   # of value for error messages, and for some expressions which the lexer does not know about).
   #
   def scan
     # PERFORMANCE note: it is faster to access local variables than instance variables.
     # This makes a small but notable difference since instance member access is avoided for
     # every token in the lexed content.
     #
     scn = @scanner
     ctx = @lexing_context
     queue = @token_queue

     lex_error_without_pos(Puppet::Pops::Issues::NO_INPUT_TO_LEXER) unless scn

     scn.skip(PATTERN_WS)

     # This is the lexer's main loop
     until queue.empty? && scn.eos? do
       if token = queue.shift || lex_token
         ctx[:after] = token[0]
         yield token
       end
     end

     # Signals end of input
     yield [false, false]
   end
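   # Illustration (not part of the original file): consuming tokens one at a time
   # with scan; the block sees each [Symbol, TokenValue] pair, then [false, false]:
   #
   #   lexer = Puppet::Pops::Parser::Lexer2.new
   #   lexer.lex_string('notice("hi")', 'demo.pp')
   #   lexer.scan do |sym, _value|
   #     puts sym   # :NAME, :LPAREN, :STRING, :RPAREN, then false at end of input
   #   end
   #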
   # This lexes one token at the current position of the scanner.
   # PERFORMANCE NOTE: Any change to this logic should be performance measured.
   #
   def lex_token
     # Using three char look ahead (may be faster to do 2 char look ahead since only 2 tokens require a third)
     scn = @scanner
     ctx = @lexing_context
     before = @scanner.pos

     # A look ahead of 3 characters is used since the longest operator ambiguity is resolved at that point.
     # PERFORMANCE NOTE: It is faster to peek once and use three separate variables for lookahead 0, 1 and 2.
     #
     la = scn.peek(3)
     return nil if la.empty?

     # Ruby 1.8.7 requires using offset and length (otherwise integers are returned).
     # PERFORMANCE NOTE.
     # It is slightly faster to use these local variables than accessing la[0], la[1] etc. in ruby 1.9.3,
     # but not big enough to warrant two completely different implementations.
     #
     la0 = la[0,1]
     la1 = la[1,1]
     la2 = la[2,1]

     # PERFORMANCE NOTE:
     # A case when, where all the cases are literal values, is the fastest way to map from data to code.
     # It is much faster than using a hash with lambdas, or a hash with symbols used to then invoke send, etc.
     # This case statement is evaluated for most character positions in puppet source, and great care must
     # be taken to not introduce performance regressions.
     #
     case la0
     when '.'
       emit(TOKEN_DOT, before)

     when ','
       emit(TOKEN_COMMA, before)

     when '['
       if (before == 0 || scn.string[locator.char_offset(before)-1,1] =~ /[[:blank:]\r\n]+/)
         emit(TOKEN_LISTSTART, before)
       else
         emit(TOKEN_LBRACK, before)
       end

     when ']'
       emit(TOKEN_RBRACK, before)

     when '('
       emit(TOKEN_LPAREN, before)

     when ')'
       emit(TOKEN_RPAREN, before)

     when ';'
       emit(TOKEN_SEMIC, before)

     when '?'
       emit(TOKEN_QMARK, before)

     when '*'
       emit(TOKEN_TIMES, before)

     when '%'
       if la1 == '>' && ctx[:epp_mode]
         scn.pos += 2
         if ctx[:epp_mode] == :expr
           enqueue_completed(TOKEN_EPPEND, before)
         end
         ctx[:epp_mode] = :text
         interpolate_epp
       else
         emit(TOKEN_MODULO, before)
       end

     when '{'
       # The lexer needs to help the parser since the technology used cannot deal with
       # lookahead of the same token with different precedence. This is solved by making a left brace
       # after ? into a separate token.
       #
       ctx[:brace_count] += 1
       emit(if ctx[:after] == :QMARK
         TOKEN_SELBRACE
       else
         TOKEN_LBRACE
       end, before)

     when '}'
       ctx[:brace_count] -= 1
       emit(TOKEN_RBRACE, before)

     # TOKENS @, @@, @(
     when '@'
       case la1
       when '@'
         emit(TOKEN_ATAT, before) # TODO: Check if this is good for the grammar
       when '('
         heredoc
       else
         emit(TOKEN_AT, before)
       end

     # TOKENS |, |>, |>>
     when '|'
       emit(case la1
       when '>'
         la2 == '>' ? TOKEN_RRCOLLECT : TOKEN_RCOLLECT
       else
         TOKEN_PIPE
       end, before)

     # TOKENS =, =>, ==, =~
     when '='
       emit(case la1
       when '='
         TOKEN_ISEQUAL
       when '>'
         TOKEN_FARROW
       when '~'
         TOKEN_MATCH
       else
         TOKEN_EQUALS
       end, before)

     # TOKENS '+', '+=', and '+>'
     when '+'
       emit(case la1
       when '='
         TOKEN_APPENDS
       when '>'
         TOKEN_PARROW
       else
         TOKEN_PLUS
       end, before)

     # TOKENS '-', '->', and epp '-%>' (end of interpolation with trim)
     when '-'
       if ctx[:epp_mode] && la1 == '%' && la2 == '>'
         scn.pos += 3
         if ctx[:epp_mode] == :expr
           enqueue_completed(TOKEN_EPPEND_TRIM, before)
         end
         interpolate_epp(:with_trim)
       else
         emit(case la1
         when '>'
           TOKEN_IN_EDGE
         when '='
           TOKEN_DELETES
         else
           TOKEN_MINUS
         end, before)
       end

     # TOKENS !, !=, !~
     when '!'
       emit(case la1
       when '='
         TOKEN_NOTEQUAL
       when '~'
         TOKEN_NOMATCH
       else
         TOKEN_NOT
       end, before)

     # TOKENS ~>, ~
     when '~'
       emit(la1 == '>' ? TOKEN_IN_EDGE_SUB : TOKEN_TILDE, before)

     when '#'
       scn.skip(PATTERN_COMMENT)
       nil

     # TOKENS '/', '/*' and '/ regexp /'
     when '/'
       case la1
       when '*'
         scn.skip(PATTERN_MLCOMMENT)
         nil
       else
         # a '/' in a regexp-acceptable position is a regexp, else a div
         if regexp_acceptable? && value = scn.scan(PATTERN_REGEX)
           # Ensure an escaped / was not matched
           while value[-2..-2] == STRING_BSLASH_BSLASH # i.e. \\
             value += scn.scan_until(PATTERN_REGEX_END)
           end
           regex = value.sub(PATTERN_REGEX_A, '').sub(PATTERN_REGEX_Z, '').gsub(PATTERN_REGEX_ESC, '/')
           emit_completed([:REGEX, Regexp.new(regex), scn.pos - before], before)
         else
           emit(TOKEN_DIV, before)
         end
       end

     # TOKENS <, <=, <|, <<|, <<, <-, <~
     when '<'
       emit(case la1
       when '<'
         if la2 == '|'
           TOKEN_LLCOLLECT
         else
           TOKEN_LSHIFT
         end
       when '='
         TOKEN_LESSEQUAL
       when '|'
         TOKEN_LCOLLECT
       when '-'
         TOKEN_OUT_EDGE
       when '~'
         TOKEN_OUT_EDGE_SUB
       else
         TOKEN_LESSTHAN
       end, before)

     # TOKENS >, >=, >>
     when '>'
       emit(case la1
       when '>'
         TOKEN_RSHIFT
       when '='
         TOKEN_GREATEREQUAL
       else
         TOKEN_GREATERTHAN
       end, before)
     # TOKENS :, ::CLASSREF, ::NAME
     when ':'
       if la1 == ':'
         before = scn.pos

         # PERFORMANCE NOTE: This could potentially be sped up by using a case/when listing all
         # upper case letters. Alternatively, the 'A' and 'Z' comparisons may be faster if they are
         # frozen.
         #
         if la2 >= 'A' && la2 <= 'Z'
           # CLASSREF or error
           value = scn.scan(PATTERN_CLASSREF)
           if value
             after = scn.pos
             emit_completed([:CLASSREF, value.freeze, after - before], before)
           else
             # move to faulty position ('::' was ok)
             scn.pos = scn.pos + 3
             lex_error(Puppet::Pops::Issues::ILLEGAL_FULLY_QUALIFIED_CLASS_REFERENCE)
           end
         else
           value = scn.scan(PATTERN_BARE_WORD)
           if value
             if value =~ PATTERN_NAME
               emit_completed([:NAME, value.freeze, scn.pos - before], before)
             else
               emit_completed([:WORD, value.freeze, scn.pos - before], before)
             end
           else
             # move to faulty position ('::' was ok)
             scn.pos = scn.pos + 2
             lex_error(Puppet::Pops::Issues::ILLEGAL_FULLY_QUALIFIED_NAME)
           end
         end
       else
         emit(TOKEN_COLON, before)
       end

     when '$'
       if value = scn.scan(PATTERN_DOLLAR_VAR)
         emit_completed([:VARIABLE, value[1..-1].freeze, scn.pos - before], before)
       else
         # consume the $ and let the higher layer complain about the error instead of getting a syntax error
         emit(TOKEN_VARIABLE_EMPTY, before)
       end

     when '"'
       # Recursive string interpolation, 'interpolate' either returns a STRING token, or
       # a DQPRE with the rest of the string's tokens placed in the @token_queue
       interpolate_dq

     when "'"
       emit_completed([:STRING, slurp_sqstring.freeze, scn.pos - before], before)

     when '0', '1', '2', '3', '4', '5', '6', '7', '8', '9'
       value = scn.scan(PATTERN_NUMBER)
       if value
         length = scn.pos - before
         assert_numeric(value, length)
         emit_completed([:NUMBER, value.freeze, length], before)
       else
         # move to faulty position ([0-9] was ok)
         scn.pos = scn.pos + 1
         lex_error(Puppet::Pops::Issues::ILLEGAL_NUMBER)
       end

     when 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
          'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '_'
       value = scn.scan(PATTERN_BARE_WORD)
       if value && value =~ PATTERN_NAME
         emit_completed(KEYWORDS[value] || [:NAME, value.freeze, scn.pos - before], before)
       elsif value
         emit_completed([:WORD, value.freeze, scn.pos - before], before)
       else
         # move to faulty position ([a-z_] was ok)
         scn.pos = scn.pos + 1
         fully_qualified = scn.match?(/::/)
         if fully_qualified
           lex_error(Puppet::Pops::Issues::ILLEGAL_FULLY_QUALIFIED_NAME)
         else
           lex_error(Puppet::Pops::Issues::ILLEGAL_NAME_OR_BARE_WORD)
         end
       end

     when 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M',
          'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z'
       value = scn.scan(PATTERN_CLASSREF)
       if value
         emit_completed([:CLASSREF, value.freeze, scn.pos - before], before)
       else
         # move to faulty position ([A-Z] was ok)
         scn.pos = scn.pos + 1
         lex_error(Puppet::Pops::Issues::ILLEGAL_CLASS_REFERENCE)
       end

     when "\n"
       # If heredoc_cont is in effect there are heredoc text lines to skip over,
       # otherwise just skip the newline.
       #
       if ctx[:newline_jump]
         scn.pos = ctx[:newline_jump]
         ctx[:newline_jump] = nil
       else
         scn.pos += 1
       end
       return nil

     when ' ', "\t", "\r"
       scn.skip(PATTERN_WS)
       return nil

     else
       # In case of unicode spaces of various kinds that are captured by a regexp, but not by the
       # simpler case expression above (not worth handling those special cases with better performance).
       if scn.skip(PATTERN_WS)
         nil
       else
         # "unrecognized char"
         emit([:OTHER, la0, 1], before)
       end
     end
   end
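   # Illustration (not part of the original file): the blank-sensitive '[' handling
   # in lex_token above, shown with a hypothetical helper that collects token symbols:
   #
   #   tokens = ->(src) do
   #     l = Puppet::Pops::Parser::Lexer2.new
   #     l.lex_string(src, 'demo.pp')
   #     l.fullscan.map(&:first)
   #   end
   #
   #   tokens.call('$a[1]')   # '[' follows a value     => [:VARIABLE, :LBRACK, :NUMBER, :RBRACK, false]
   #   tokens.call('$a [1]')  # '[' preceded by a blank => [:VARIABLE, :LISTSTART, :NUMBER, :RBRACK, false]
   #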
   # Emits (produces) a token [:tokensymbol, TokenValue] and moves the scanner's position past the token.
   #
   def emit(token, byte_offset)
     @scanner.pos = byte_offset + token[2]
     [token[0], TokenValue.new(token, byte_offset, @locator)]
   end

   # Emits a completed token on the form [:tokensymbol, TokenValue]. This method does not alter
   # the scanner's position.
   #
   def emit_completed(token, byte_offset)
     [token[0], TokenValue.new(token, byte_offset, @locator)]
   end

   # Enqueues a completed token at the given offset
   def enqueue_completed(token, byte_offset)
     @token_queue << emit_completed(token, byte_offset)
   end

   # Allows subprocessors for heredoc etc to enqueue tokens that are tokenized by a different lexer instance
   #
   def enqueue(emitted_token)
     @token_queue << emitted_token
   end

   # Answers after which tokens it is acceptable to lex a regular expression.
   # PERFORMANCE NOTE:
   # It may be beneficial to turn this into a hash with a default value of true for missing entries.
   # A case expression with literal values will however create a hash internally. Since a reference is
   # always needed to the hash, this access is almost as costly as a method call.
   #
   def regexp_acceptable?
     case @lexing_context[:after]

     # Ends of (potential) R-value generating expressions
     when :RPAREN, :RBRACK, :RRCOLLECT, :RCOLLECT
       false

     # End of (potential) R-value - but must be allowed because of case expressions.
     # Called out here to not be mistaken for a bug.
     when :RBRACE
       true

     # Operands (that can be followed by DIV (even if illegal in grammar))
     when :NAME, :CLASSREF, :NUMBER, :STRING, :BOOLEAN, :DQPRE, :DQMID, :DQPOST, :HEREDOC, :REGEX, :VARIABLE, :WORD
       false

     else
       true
     end
   end
 end
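 # Illustration (not part of the original file): regexp_acceptable? resolving the
 # '/' ambiguity based on the preceding token (using the tokens helper sketched above):
 #
 #   tokens.call('$x = /foo/')  # '/' after EQUALS starts a regexp
 #   # => [:VARIABLE, :EQUALS, :REGEX, false]
 #   tokens.call('$x / 2')      # '/' after VARIABLE is division
 #   # => [:VARIABLE, :DIV, :NUMBER, false]
 #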
diff --git a/lib/puppet/pops/parser/slurp_support.rb b/lib/puppet/pops/parser/slurp_support.rb
index d87cd2f32..c16441e90 100644
--- a/lib/puppet/pops/parser/slurp_support.rb
+++ b/lib/puppet/pops/parser/slurp_support.rb
@@ -1,111 +1,111 @@
 # This module is an integral part of the Lexer.
 # It defines the string slurping behavior - finding the string and non string parts in interpolated
 # strings, translating escape sequences in strings to their single character equivalents.
 #
 # PERFORMANCE NOTE: The various kinds of slurping could be made even more generic, but that requires
 # additional parameter passing and evaluation of conditional logic.
 # TODO: More detailed performance analysis of excessive character escaping and interpolation.
 #
 module Puppet::Pops::Parser::SlurpSupport
   include Puppet::Pops::Parser::LexerSupport

   SLURP_SQ_PATTERN = /(?:[^\\]|^|[^\\])(?:[\\]{2})*[']/
   SLURP_DQ_PATTERN = /(?:[^\\]|^|[^\\])(?:[\\]{2})*(["]|[$]\{?)/
   SLURP_UQ_PATTERN = /(?:[^\\]|^|[^\\])(?:[\\]{2})*([$]\{?|\z)/
   # unquoted, no escapes
-  SLURP_UQNE_PATTERN = /([$]\{?|\z)/m
+  SLURP_UQNE_PATTERN = /(\$\{?|\z)/m
   SLURP_ALL_PATTERN = /.*(\z)/m
   SQ_ESCAPES = %w{ \\ ' }
   DQ_ESCAPES = %w{ \\ $ ' " r n t s u } + ["\r\n", "\n"]
   UQ_ESCAPES = %w{ \\ $ r n t s u } + ["\r\n", "\n"]

   def slurp_sqstring
     # skip the leading '
     @scanner.pos += 1
     str = slurp(@scanner, SLURP_SQ_PATTERN, SQ_ESCAPES, :ignore_invalid_escapes)
     lex_error(Puppet::Pops::Issues::UNCLOSED_QUOTE, :after => "\"'\"", :followed_by => followed_by) unless str
     str[0..-2] # strip closing "'" from result
   end

   def slurp_dqstring
     scn = @scanner
     last = scn.matched
     str = slurp(scn, SLURP_DQ_PATTERN, DQ_ESCAPES, false)
     unless str
       lex_error(Puppet::Pops::Issues::UNCLOSED_QUOTE, :after => format_quote(last), :followed_by => followed_by)
     end

     # Terminator may be a single char '"', '$', or two characters '${'; group match 1 (scn[1]) from the last slurp holds this.
     terminator = scn[1]
     [str[0..(-1 - terminator.length)], terminator]
   end

   # Copy from old lexer - can do much better
   def slurp_uqstring
     scn = @scanner
     last = scn.matched
     ignore = true
     str = slurp(scn, @lexing_context[:uq_slurp_pattern], @lexing_context[:escapes], :ignore_invalid_escapes)

     # Terminator may be a single char '$', two characters '${', or an empty string '' at the end of input.
     # Group match 1 holds this.
     # The terminator is found by looking at subgroup 1 of the most recent match made by the scanner (i.e. @scanner[1]).
     # This is the last match made by the slurp method (having called scan_until on the scanner).
     # If there is a terminating character it must be stripped and returned separately.
     #
     terminator = scn[1]
     [str[0..(-1 - terminator.length)], terminator]
   end

   # Slurps a string from the given scanner until the given pattern and then replaces any escaped
   # characters given by escapes with their control-character equivalents or, in the case of line breaks, replaces the
   # pattern \r?\n with an empty string.
   # The returned string contains the terminating character. Returns nil if the scanner can not scan until the given
   # pattern.
   #
   def slurp(scanner, pattern, escapes, ignore_invalid_escapes)
     str = scanner.scan_until(pattern) || return

     # Process unicode escapes first as they require getting 4 hex digits.
     # If a \u remains after this pass, the second pass below warns that it is not a unicode escape.
     if escapes.include?('u')
       str.gsub!(/\\u(?:([\da-fA-F]{4})|\{([\da-fA-F]{1,6})\})/m) {
         [($1 || $2).hex].pack("U")
       }
     end

     begin
       str.gsub!(/\\([^\r\n]|(?:\r?\n))/m) {
         ch = $1
         if escapes.include? ch
           case ch
           when 'r'   ; "\r"
           when 'n'   ; "\n"
           when 't'   ; "\t"
           when 's'   ; " "
           when 'u'
             lex_warning(Puppet::Pops::Issues::ILLEGAL_UNICODE_ESCAPE)
             "\\u"
           when "\n"  ; ''
           when "\r\n"; ''
           else ch
           end
         else
           lex_warning(Puppet::Pops::Issues::UNRECOGNIZED_ESCAPE, :ch => ch) unless ignore_invalid_escapes
           "\\#{ch}"
         end
       }
     rescue ArgumentError => e
       # An invalid byte sequence may be the result of faulty input as well, but that could not possibly
       # have reached this far... Unfortunately there is no more specific error, and a match on the message is
       # required to differentiate it from other internal problems.
       if e.message =~ /invalid byte sequence/
         lex_error(Puppet::Pops::Issues::ILLEGAL_UNICODE_ESCAPE)
       else
         raise e
       end
     end
     str
   end
 end
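 # Illustration (not part of the original file): the escape translation performed
 # by slurp, observed through a double-quoted string. Accessing the token value via
 # [:value] assumes the TokenValue API sketched in the lexer2.rb header comment:
 #
 #   l = Puppet::Pops::Parser::Lexer2.new
 #   l.lex_string(%q{"a\tb\u0041"}, 'demo.pp')
 #   sym, value = l.fullscan.first
 #   sym            # => :STRING
 #   value[:value]  # => "a\tbA" (the \t is now a real tab; \u0041 became "A")
 #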