516 lines
14 KiB
Ruby
516 lines
14 KiB
Ruby
# -*- coding: utf-8 -*- #
|
|
# frozen_string_literal: true
|
|
|
|
# stdlib
|
|
require 'strscan'
|
|
require 'cgi'
|
|
require 'set'
|
|
|
|
module Rouge
|
|
# @abstract
|
|
# A lexer transforms text into a stream of `[token, chunk]` pairs.
|
|
class Lexer
|
|
include Token::Tokens
|
|
|
|
@option_docs = {}
|
|
|
|
class << self
|
|
# Lexes `stream` with the given options. The lex is delegated to a
# new instance.
#
# @see #lex
def lex(stream, opts = {}, &blk)
  instance = new(opts)
  instance.lex(stream, &blk)
end

# In case #continue_lex is called statically, we simply
# begin a new lex from the beginning, since there is no state.
#
# @see #continue_lex
def continue_lex(*args, &blk)
  lex(*args, &blk)
end
|
|
|
|
# Given a name in string, return the correct lexer class.
# @param [String] name
# @return [Class<Rouge::Lexer>,nil]
def find(name)
  key = name.to_s
  registry[key]
end
|
|
|
|
# Find a lexer, with fancy shiny features.
#
# * The string you pass can include CGI-style options
#
#     Lexer.find_fancy('erb?parent=tex')
#
# * You can pass the special name 'guess' so we guess for you,
#   and you can pass a second argument of the code to guess by
#
#     Lexer.find_fancy('guess', "#!/bin/bash\necho Hello, world")
#
# If the code matches more than one lexer then Guesser::Ambiguous
# is raised.
#
# This is used in the Redcarpet plugin as well as Rouge's own
# markdown lexer for highlighting internal code blocks.
#
def find_fancy(str, code = nil, additional_options = {})
  # Fast path: a plain lexer name, with no CGI options and no guessing.
  if str && str != 'guess' && !str.include?('?')
    klass = find(str)
    return klass && klass.new(additional_options)
  end

  name, query = str ? str.split('?', 2) : [nil, '']

  # parse the options hash from a cgi-style string
  parsed = CGI.parse(query || '').map do |key, vals|
    value =
      if vals.size == 0
        true
      elsif vals.size == 1
        vals[0]
      else
        vals
      end

    [key.to_s, value]
  end

  merged = additional_options.merge(Hash[parsed])

  klass =
    case name
    when 'guess', nil
      self.guess(:source => code, :mimetype => merged['mimetype'])
    when String
      self.find(name)
    end

  klass && klass.new(merged)
end
|
|
|
|
# Specify or get this lexer's title. Meant to be human-readable.
def title(t = nil)
  # fall back to a capitalized tag when no explicit title was given
  t = tag.capitalize if t.nil?
  @title ||= t
end
|
|
|
|
# Specify or get this lexer's description.
def desc(arg = :absent)
  return @desc if arg == :absent

  @desc = arg
end
|
|
|
|
# Documentation for this lexer's options, shared with (and falling
# back to) the superclass's docs.
def option_docs
  @option_docs ||= InheritableHash.new(superclass.option_docs)
end

# Document a single option (`name`) with a human-readable description.
def option(name, desc)
  key = name.to_s
  option_docs[key] = desc
end
|
|
|
|
# Specify or get the path name containing a small demo for
# this lexer (can be overriden by {demo}).
def demo_file(arg = :absent)
  unless arg == :absent
    return @demo_file = Pathname.new(arg)
  end

  @demo_file = Pathname.new(File.join(__dir__, 'demos', tag))
end

# Specify or get a small demo string for this lexer
def demo(arg = :absent)
  unless arg == :absent
    return @demo = arg
  end

  # read with a BOM-tolerant UTF-8 encoding
  @demo = File.read(demo_file, mode: 'rt:bom|utf-8')
end
|
|
|
|
# @return a list of all lexers.
def all
  return @all if @all

  @all = registry.values.uniq
end
|
|
|
|
# Guess which lexer to use based on a hash of info.
#
# This accepts the same arguments as Lexer.guess, but will never throw
# an error. It will return a (possibly empty) list of potential lexers
# to use.
def guesses(info = {})
  mimetype = info[:mimetype]
  filename = info[:filename]
  source   = info[:source]
  custom_globs = info[:custom_globs]

  # start from any caller-supplied guessers; order matters, so the
  # built-in guessers below run from most to least specific.
  guessers = (info[:guessers] || []).dup

  guessers << Guessers::Mimetype.new(mimetype) if mimetype
  if custom_globs && filename
    guessers << Guessers::GlobMapping.by_pairs(custom_globs, filename)
  end
  guessers << Guessers::Filename.new(filename) if filename
  if source
    guessers << Guessers::Modeline.new(source)
    guessers << Guessers::Source.new(source)
  end
  guessers << Guessers::Disambiguation.new(filename, source) if source && filename

  Guesser.guess(guessers, Lexer.all)
end
|
|
|
|
# Guess which lexer to use based on a hash of info.
#
# @option info :mimetype
#   A mimetype to guess by
# @option info :filename
#   A filename to guess by
# @option info :source
#   The source itself, which, if guessing by mimetype or filename
#   fails, will be searched for shebangs, <!DOCTYPE ...> tags, and
#   other hints.
# @param [Proc] fallback called if multiple lexers are detected.
#   If omitted, Guesser::Ambiguous is raised.
#
# @see Lexer.detect?
# @see Lexer.guesses
# @return [Class<Rouge::Lexer>]
def guess(info = {}, &fallback)
  candidates = guesses(info)

  case candidates.size
  when 0
    Lexers::PlainText
  when 1
    candidates.first
  else
    # ambiguous: defer to the caller's fallback, or raise
    if fallback
      fallback.call(candidates)
    else
      raise Guesser::Ambiguous.new(candidates)
    end
  end
end
|
|
|
|
# Convenience wrapper around {.guess} keyed by mimetype.
def guess_by_mimetype(mt)
  guess(:mimetype => mt)
end

# Convenience wrapper around {.guess} keyed by filename.
def guess_by_filename(fname)
  guess(:filename => fname)
end

# Convenience wrapper around {.guess} keyed by source text.
def guess_by_source(source)
  guess(:source => source)
end
|
|
|
|
# Turn on debug logging for lexes performed by this lexer class.
def enable_debug!
  @debug_enabled = true
end

# Turn debug logging back off (undoes {.enable_debug!}).
def disable_debug!
  # remove the ivar entirely so debug_enabled? reports false again
  remove_instance_variable :@debug_enabled if defined? @debug_enabled
end

# Whether debug logging has been enabled via {.enable_debug!}.
def debug_enabled?
  # true whenever the ivar exists at all, regardless of its value
  !!(defined? @debug_enabled)
end

# Determine if a lexer has a method named +:detect?+ defined in its
# singleton class.
def detectable?
  @detectable ||= methods(false).include?(:detect?)
end
|
|
|
|
protected
|
|
# @private
|
|
def register(name, lexer)
|
|
# reset an existing list of lexers
|
|
@all = nil if defined?(@all)
|
|
registry[name.to_s] = lexer
|
|
end
|
|
|
|
public

# Used to specify or get the canonical name of this lexer class.
#
# @example
#   class MyLexer < Lexer
#     tag 'foo'
#   end
#
#   MyLexer.tag # => 'foo'
#
#   Lexer.find('foo') # => MyLexer
def tag(t = nil)
  if t.nil?
    @tag
  else
    @tag = t.to_s
    Lexer.register(@tag, self)
  end
end
|
|
|
|
# Used to specify alternate names this lexer class may be found by.
#
# @example
#   class Erb < Lexer
#     tag 'erb'
#     aliases 'eruby', 'rhtml'
#   end
#
#   Lexer.find('eruby') # => Erb
def aliases(*args)
  names = args.map(&:to_s)
  names.each { |n| Lexer.register(n, self) }
  @aliases ||= []
  @aliases.concat(names)
end
|
|
|
|
# Specify a list of filename globs associated with this lexer.
#
# If a filename glob is associated with more than one lexer, this can
# cause a Guesser::Ambiguous error to be raised in various guessing
# methods. These errors can be avoided by disambiguation. Filename globs
# are disambiguated in one of two ways. Either the lexer will define a
# `self.detect?` method (intended for use with shebangs and doctypes) or a
# manual rule will be specified in Guessers::Disambiguation.
#
# @example
#   class Ruby < Lexer
#     filenames '*.rb', '*.ruby', 'Gemfile', 'Rakefile'
#   end
def filenames(*fnames)
  @filenames ||= []
  @filenames.concat(fnames)
end

# Specify a list of mimetypes associated with this lexer.
#
# @example
#   class Html < Lexer
#     mimetypes 'text/html', 'application/xhtml+xml'
#   end
def mimetypes(*mts)
  @mimetypes ||= []
  @mimetypes.concat(mts)
end
|
|
|
|
# @private
def assert_utf8!(str)
  # ASCII-8BIT (binary) is accepted as well, since it is byte-compatible
  name = str.encoding.name
  return if %w(US-ASCII UTF-8 ASCII-8BIT).include?(name)

  message = "Bad encoding: #{str.encoding.names.join(',')}. " \
            "Please convert your string to UTF-8."
  raise EncodingError.new(message)
end

private

# The shared name -> lexer class mapping used by {.find} and {.register}.
def registry
  @registry ||= {}
end
|
|
end
|
|
|
|
# -*- instance methods -*- #
|
|
|
|
attr_reader :options
|
|
# Create a new lexer with the given options. Individual lexers may
|
|
# specify extra options. The only current globally accepted option
|
|
# is `:debug`.
|
|
#
|
|
# @option opts :debug
|
|
# Prints debug information to stdout. The particular info depends
|
|
# on the lexer in question. In regex lexers, this will log the
|
|
# state stack at the beginning of each step, along with each regex
|
|
# tried and each stream consumed. Try it, it's pretty useful.
|
|
def initialize(opts={})
|
|
@options = {}
|
|
opts.each { |k, v| @options[k.to_s] = v }
|
|
|
|
@debug = Lexer.debug_enabled? && bool_option('debug')
|
|
end
|
|
|
|
# Interpret an option value as a boolean. nil, false, 0, '0' and 'off'
# are false; an empty array is true; otherwise the last array element
# decides; everything else is true.
def as_bool(val)
  case val
  when nil, false, 0, '0', 'off'
    false
  when Array
    if val.empty?
      true
    else
      as_bool(val.last)
    end
  else
    true
  end
end

# Interpret an option value as a string (last element wins for arrays;
# nil/false yield nil).
def as_string(val)
  if val.is_a?(Array)
    as_string(val.last)
  elsif val
    val.to_s
  else
    nil
  end
end

# Interpret an option value as a list: arrays are flattened recursively,
# strings are split on commas, anything else yields an empty list.
def as_list(val)
  if val.is_a?(Array)
    val.flat_map { |v| as_list(v) }
  elsif val.is_a?(String)
    val.split(',')
  else
    []
  end
end
|
|
|
|
# Interpret an option value as a lexer instance.
#
# Accepts a Lexer instance (returned as-is), a Lexer subclass
# (instantiated with our options), a registered lexer name, or an
# array (last element wins). Returns nil for anything else.
def as_lexer(val)
  return as_lexer(val.last) if val.is_a?(Array)

  if val.is_a?(Class) && val < Lexer
    return val.new(@options)
  end

  if val.is_a?(Lexer)
    val
  elsif val.is_a?(String)
    klass = Lexer.find(val)
    klass && klass.new(@options)
  end
end

# Interpret an option value as a token: Token instances pass through,
# anything else goes through Token[] lookup (last element wins for arrays).
def as_token(val)
  return as_token(val.last) if val.is_a?(Array)

  val.is_a?(Token) ? val : Token[val]
end
|
|
|
|
# Fetch the option `name` as a boolean. `default` (a block) supplies
# the fallback when the option is absent. Unlike the accessors below,
# this does not remove the option from @options.
def bool_option(name, &default)
  key = name.to_s

  if @options.key?(key)
    as_bool(@options[key])
  else
    default ? default.call : false
  end
end

# Remove and return the option `name`, coerced to a string.
def string_option(name, &default)
  as_string(@options.delete(name.to_s, &default))
end

# Remove and return the option `name`, coerced to a lexer.
def lexer_option(name, &default)
  as_lexer(@options.delete(name.to_s, &default))
end

# Remove and return the option `name`, coerced to a list.
def list_option(name, &default)
  as_list(@options.delete(name.to_s, &default))
end

# Remove and return the option `name`, coerced to a token.
def token_option(name, &default)
  as_token(@options.delete(name.to_s, &default))
end
|
|
|
|
# Fetch the hash-valued option `name`, merged over `defaults`.
#
# Values may be given either as a literal hash stored under `name`, or
# as individual cgi-style `name[key]` entries; both forms are removed
# from @options. `val_cast`, when given, converts each value.
def hash_option(name, defaults, &val_cast)
  name = name.to_s
  out = defaults.dup

  base = @options.delete(name)
  base = {} unless base.is_a?(Hash)
  base.each do |k, v|
    out[k.to_s] = val_cast ? val_cast.call(v) : v
  end

  # also accept cgi-style name[key]=value entries
  @options.keys.each do |key|
    match = /(\w+)\[(\w+)\]/.match(key)
    next unless match && match[1] == name

    value = @options.delete(key)
    out[match[2]] = val_cast ? val_cast.call(value) : value
  end

  out
end
|
|
|
|
# @abstract
#
# Called after each lex is finished. The default implementation
# is a noop.
def reset!
  # intentionally empty; stateful subclasses override this
end
|
|
|
|
# Given a string, yield [token, chunk] pairs. If no block is given,
# an enumerator is returned.
#
# @option opts :continue
#   Continue the lex from the previous state (i.e. don't call #reset!)
#
# @note The use of :continue => true has been deprecated. A warning is
#   issued if run with `$VERBOSE` set to true.
#
# @note The use of arbitrary `opts` has never been supported, but we
#   previously ignored them with no error. We now warn unconditionally.
def lex(string, opts = nil, &b)
  if opts
    unless (opts.keys - [:continue]).empty?
      # improper use of options hash
      warn('Improper use of Lexer#lex - this method does not receive options.' +
           ' This will become an error in a future version.')
    end

    if opts[:continue]
      warn '`lex :continue => true` is deprecated, please use #continue_lex instead'
      return continue_lex(string, &b)
    end
  end

  return enum_for(:lex, string) unless block_given?

  Lexer.assert_utf8!(string)
  reset!

  continue_lex(string, &b)
end
|
|
|
|
# Continue the lex from the current state without resetting
def continue_lex(string, &b)
  return enum_for(:continue_lex, string, &b) unless block_given?

  # consolidate consecutive tokens of the same type
  pending_tok = nil
  pending_val = nil

  stream_tokens(string) do |tok, val|
    next if val.empty?

    if tok == pending_tok
      # same token type: extend the buffered chunk in place
      pending_val << val
    else
      b.call(pending_tok, pending_val) if pending_tok
      pending_tok = tok
      pending_val = val
    end
  end

  # flush the final buffered pair, if any
  b.call(pending_tok, pending_val) if pending_tok
end
|
|
|
|
# delegated to {Lexer.tag}
def tag
  klass = self.class
  klass.tag
end
|
|
|
|
# @abstract
#
# Yield `[token, chunk]` pairs, given a prepared input stream. This
# must be implemented.
#
# @param [StringScanner] stream
#   the stream
def stream_tokens(stream, &b)
  # concrete lexers are responsible for producing the token stream
  raise 'abstract'
end
|
|
|
|
# @abstract
#
# Return true if there is an in-text indication (such as a shebang
# or DOCTYPE declaration) that this lexer should be used.
#
# @param [TextAnalyzer] text
#   the text to be analyzed, with a couple of handy methods on it,
#   like {TextAnalyzer#shebang?} and {TextAnalyzer#doctype?}
def self.detect?(text)
  # conservative default: no lexer claims the text
  false
end
|
|
end
|
|
|
|
module Lexers
  # tracks lexer files already loaded, keyed by relative path
  @_loaded_lexers = {}

  # Load the lexer file at `relpath` (relative to lib/rouge/lexers),
  # at most once per path.
  def self.load_lexer(relpath)
    return if @_loaded_lexers.key?(relpath)

    # mark before loading so a re-entrant call can't load twice
    @_loaded_lexers[relpath] = true

    path = File.join(__dir__, 'lexers', relpath)
    load path
  end
end
|
|
end
|