# encoding: utf-8
require_relative 'scanner'
module Crass
# Tokenizes a CSS string.
#
# 4. http://dev.w3.org/csswg/css-syntax/#tokenization
class Tokenizer
# Regular expressions used by the tokenizer. Throughout these patterns,
# "whitespace" means newline, tab (U+0009) or space (U+0020); other newline
# forms are normalized to "\n" by #preprocess before scanning starts.

# The two-character sequence `*/` that terminates a comment.
RE_COMMENT_CLOSE = /\*\//
# One or more ASCII digits.
RE_DIGIT = /[0-9]+/
# A backslash followed by any single character other than a newline.
RE_ESCAPE = /\\[^\n]/
# Between one and six hexadecimal digits (the body of a hex escape or one side
# of a unicode range).
RE_HEX = /[0-9A-Fa-f]{1,6}/
# One or more name characters: ASCII letters, digits, `_`, `-`, or any code
# point from U+0080 upwards.
RE_NAME = /[0-9A-Za-z_\u0080-\u{10ffff}-]+/
# One or more name-start characters: like RE_NAME but without digits and `-`.
RE_NAME_START = /[A-Za-z_\u0080-\u{10ffff}]+/
# One or more control characters that make an unquoted URL invalid.
RE_NON_PRINTABLE = /[\u0000-\u0008\u000b\u000e-\u001f\u007f]+/
# The fractional part of a number: a dot followed by at least one digit.
RE_NUMBER_DECIMAL = /\.[0-9]+/
# The exponent part of a number: `e`/`E`, optional sign, at least one digit.
RE_NUMBER_EXPONENT = /[Ee][+-]?[0-9]+/
# A single leading `+` or `-`.
RE_NUMBER_SIGN = /[+-]/
# Splits a complete number string into named parts (sign, integer, fractional,
# exponent_sign, exponent) for #convert_string_to_number. Anchored to the whole
# string; the fractional and exponent groups are nil when that part is absent.
RE_NUMBER_STR = /\A
(?<sign> [+-]?)
(?<integer> [0-9]*)
(?:\.
(?<fractional> [0-9]*)
)?
(?:[Ee]
(?<exponent_sign> [+-]?)
(?<exponent> [0-9]*)
)?
\z/x
# An optional single whitespace character followed by a quote, at the very
# start of the tested string; used to detect `url("...")` / `url('...')`.
RE_QUOTED_URL_START = /\A[\n\u0009\u0020]?["']/
# A `+` followed by a hex digit or `?`: what must follow `u`/`U` for a unicode
# range token.
RE_UNICODE_RANGE_START = /\+(?:[0-9A-Fa-f]|\?)/
# A `-` followed by a hex digit: introduces the end of a unicode range.
RE_UNICODE_RANGE_END = /-[0-9A-Fa-f]/
# One or more whitespace characters.
RE_WHITESPACE = /[\n\u0009\u0020]+/
# A string that consists entirely of whitespace characters.
RE_WHITESPACE_ANCHORED = /\A[\n\u0009\u0020]+\z/
# -- Class Methods ---------------------------------------------------------
# Convenience entry point: builds a Tokenizer for _input_ and returns the
# array of tokens produced by its #tokenize method.
#
# See {#initialize} for _options_.
def self.tokenize(input, options = {})
  tokenizer = Tokenizer.new(input, options)
  tokenizer.tokenize
end
# -- Instance Methods ------------------------------------------------------
# Initializes a new Tokenizer. The _input_ is run through #preprocess and
# wrapped in a Scanner; _options_ is stored as given.
#
# Options:
#
# * **:preserve_comments** - If `true`, comments will be preserved as
#   `:comment` tokens.
#
# * **:preserve_hacks** - If `true`, certain non-standard browser hacks
#   such as the IE "*" hack will be preserved even though they violate
#   CSS 3 syntax rules.
#
def initialize(input, options = {})
  @options = options
  @s = Scanner.new(preprocess(input))
end
# Consumes a token and returns the token that was consumed, or `nil` when the
# scanner is already at the end of the input.
#
# Comments are returned as `:comment` tokens only when the
# `:preserve_comments` option is set; otherwise they are skipped and the token
# that follows them is returned instead.
#
# Dispatch is done on the consumed character itself (a String) rather than on
# a Symbol derived from it, so no Symbol is interned per input character.
#
# 4.3.1. http://dev.w3.org/csswg/css-syntax/#consume-a-token
def consume
  return nil if @s.eos?

  # Remember where this token starts; #create_token reads the mark back.
  @s.mark

  # Consume comments.
  comment_token = consume_comments

  if comment_token
    return @options[:preserve_comments] ? comment_token : consume
  end

  # Consume whitespace.
  return create_token(:whitespace) if @s.scan(RE_WHITESPACE)

  char = @s.consume

  case char
  when '"', "'"
    consume_string

  when '#'
    if @s.peek =~ RE_NAME || valid_escape?(@s.peek(2))
      # :type must be computed before consume_name advances the scanner.
      create_token(:hash,
        :type  => start_identifier?(@s.peek(3)) ? :id : :unrestricted,
        :value => consume_name)
    else
      create_token(:delim, :value => char)
    end

  when '$'
    if @s.peek == '='
      @s.consume
      create_token(:suffix_match)
    else
      create_token(:delim, :value => char)
    end

  when '('
    create_token(:'(')

  when ')'
    create_token(:')')

  when '*'
    if @s.peek == '='
      @s.consume
      create_token(:substring_match)

    # Non-standard: Preserve the IE * hack.
    elsif @options[:preserve_hacks] && @s.peek =~ RE_NAME_START
      @s.reconsume
      consume_ident

    else
      create_token(:delim, :value => char)
    end

  when '+', '.'
    if start_number?
      @s.reconsume
      consume_numeric
    else
      create_token(:delim, :value => char)
    end

  when ','
    create_token(:comma)

  when '-'
    next_two_chars   = @s.peek(2)
    next_three_chars = char + next_two_chars

    if start_number?(next_three_chars)
      @s.reconsume
      consume_numeric
    elsif next_two_chars == '->'
      @s.consume
      @s.consume
      create_token(:cdc)
    elsif start_identifier?(next_three_chars)
      @s.reconsume
      consume_ident
    else
      create_token(:delim, :value => char)
    end

  when ':'
    create_token(:colon)

  when ';'
    create_token(:semicolon)

  when '<'
    if @s.peek(3) == '!--'
      @s.consume
      @s.consume
      @s.consume
      create_token(:cdo)
    else
      create_token(:delim, :value => char)
    end

  when '@'
    if start_identifier?(@s.peek(3))
      create_token(:at_keyword, :value => consume_name)
    else
      create_token(:delim, :value => char)
    end

  when '['
    create_token(:'[')

  when '\\'
    if valid_escape?
      @s.reconsume
      consume_ident
    else
      # Parse error.
      create_token(:delim,
        :error => true,
        :value => char)
    end

  when ']'
    create_token(:']')

  when '^'
    if @s.peek == '='
      @s.consume
      create_token(:prefix_match)
    else
      create_token(:delim, :value => char)
    end

  when '{'
    create_token(:'{')

  when '}'
    create_token(:'}')

  when 'U', 'u'
    if @s.peek(2) =~ RE_UNICODE_RANGE_START
      # Skip the "+"; consume_unicode_range expects "u+" to be gone.
      @s.consume
      consume_unicode_range
    else
      @s.reconsume
      consume_ident
    end

  when '|'
    case @s.peek
    when '='
      @s.consume
      create_token(:dash_match)
    when '|'
      @s.consume
      create_token(:column)
    else
      create_token(:delim, :value => char)
    end

  when '~'
    if @s.peek == '='
      @s.consume
      create_token(:include_match)
    else
      create_token(:delim, :value => char)
    end

  # The pattern branches come last so that every literal character above
  # (notably "U"/"u") takes precedence over them.
  when RE_DIGIT
    @s.reconsume
    consume_numeric

  when RE_NAME_START
    @s.reconsume
    consume_ident

  else
    create_token(:delim, :value => char)
  end
end
# Consumes the remnants of a bad URL and returns the consumed text. Scanning
# stops after the first unescaped `)` (which is consumed but not returned) or
# at the end of the input; escapes are returned in their unescaped form.
#
# 4.3.15. http://dev.w3.org/csswg/css-syntax/#consume-the-remnants-of-a-bad-url
def consume_bad_url
  remnants = String.new

  until @s.eos?
    # The current character and the one after it form an escape.
    if valid_escape?
      remnants << consume_escaped
      next
    end

    # The next two characters form an escape: step over the backslash first.
    if valid_escape?(@s.peek(2))
      @s.consume
      remnants << consume_escaped
      next
    end

    char = @s.consume
    break if char == ')'

    remnants << char
  end

  remnants
end
# Consumes a comment and returns it as a `:comment` token whose `:value` is
# the text between `/*` and `*/`, or returns `nil` if the input does not
# continue with `/*`. An unterminated comment runs to the end of the input.
#
# 4.3.2. http://dev.w3.org/csswg/css-syntax/#consume-comments
def consume_comments
  return nil unless @s.peek(2) == '/*'

  # Step over the opening "/*".
  2.times { @s.consume }

  body = @s.scan_until(RE_COMMENT_CLOSE)

  if body
    # Drop the trailing "*/" that scan_until included.
    body.slice!(-2, 2)
  else
    # Parse error.
    body = @s.consume_rest
  end

  create_token(:comment, :value => body)
end
# Consumes an escaped code point and returns its unescaped value.
#
# This method assumes that the `\` has already been consumed, and that the
# next character in the input has already been verified not to be a newline.
# At the end of the input it returns U+FFFD.
#
# A hex escape (one to six hex digits, optionally followed by one whitespace
# character, which is swallowed) yields the named code point, or U+FFFD when
# that code point is zero, a surrogate, or beyond U+10FFFF. Anything else
# yields the next character unchanged.
#
# 4.3.8. http://dev.w3.org/csswg/css-syntax/#consume-an-escaped-code-point
def consume_escaped
  return "\ufffd" if @s.eos?

  hex_digits = @s.scan(RE_HEX)
  return @s.consume unless hex_digits

  @s.consume if @s.peek =~ RE_WHITESPACE

  codepoint = hex_digits.hex
  replaceable = codepoint.zero? ||
    (0xD800..0xDFFF).cover?(codepoint) ||
    codepoint > 0x10FFFF

  replaceable ? "\ufffd" : codepoint.chr(Encoding::UTF_8)
end
# Consumes an ident-like token and returns it: an `:ident` token, a
# `:function` token when the name is directly followed by `(`, or whatever
# #consume_url returns for an unquoted `url(`.
#
# 4.3.4. http://dev.w3.org/csswg/css-syntax/#consume-an-ident-like-token
def consume_ident
  name = consume_name

  return create_token(:ident, :value => name) unless @s.peek == '('

  @s.consume

  return create_token(:function, :value => name) unless name.downcase == 'url'

  # Skip whitespace for as long as the next two characters are both
  # whitespace, leaving at most one whitespace character in front of
  # whatever follows.
  @s.consume while @s.peek(2) =~ RE_WHITESPACE_ANCHORED

  if @s.peek(2) =~ RE_QUOTED_URL_START
    # A quoted URL is tokenized as a regular function.
    create_token(:function, :value => name)
  else
    consume_url
  end
end
# Consumes a name and returns it as a String. Runs of name characters are
# taken in bulk; escapes are appended in unescaped form. The first character
# that cannot be part of the name is pushed back onto the scanner.
#
# 4.3.12. http://dev.w3.org/csswg/css-syntax/#consume-a-name
def consume_name
  name = String.new

  until @s.eos?
    chunk = @s.scan(RE_NAME)

    if chunk
      name << chunk
      next
    end

    char = @s.consume

    if valid_escape?
      name << consume_escaped

    # Non-standard: IE * hack
    elsif char == '*' && @options[:preserve_hacks]
      name << @s.consume

    else
      @s.reconsume
      break
    end
  end

  name
end
# Consumes a number and returns a 3-element array containing the number's
# original representation, its numeric value, and its type (either
# `:integer` or `:number`). The type is `:number` as soon as a fractional or
# exponent part has been consumed.
#
# 4.3.13. http://dev.w3.org/csswg/css-syntax/#consume-a-number
def consume_number
  repr = String.new
  type = :integer

  repr << @s.consume if @s.peek =~ RE_NUMBER_SIGN
  repr << (@s.scan(RE_DIGIT) || '')

  # Optional fractional part, then optional exponent part, in that order.
  [RE_NUMBER_DECIMAL, RE_NUMBER_EXPONENT].each do |pattern|
    part = @s.scan(pattern)
    next unless part

    repr << part
    type = :number
  end

  [repr, convert_string_to_number(repr), type]
end
# Consumes a numeric token and returns it: a `:dimension` token when the
# number is followed by something that starts an identifier, a `:percentage`
# token when it is followed by `%`, and a `:number` token otherwise. The
# token's `:value` is an Integer for `:integer` numbers and a Float for all
# others.
#
# 4.3.3. http://dev.w3.org/csswg/css-syntax/#consume-a-numeric-token
def consume_numeric
  repr, value, type = consume_number
  value = type == :integer ? value.to_i : value.to_f

  if start_identifier?(@s.peek(3))
    return create_token(:dimension,
      :repr  => repr,
      :type  => type,
      :unit  => consume_name,
      :value => value)
  end

  node = :number

  if @s.peek == '%'
    @s.consume
    node = :percentage
  end

  create_token(node,
    :repr  => repr,
    :type  => type,
    :value => value)
end
# Consumes a string token that ends at the given character, and returns the
# token. When _ending_ is `nil`, the scanner's current character (the quote
# that was just consumed) is used as the terminator.
#
# An unescaped newline ends the string early with a `:bad_string` token; the
# newline itself is pushed back onto the scanner.
#
# 4.3.5. http://dev.w3.org/csswg/css-syntax/#consume-a-string-token
def consume_string(ending = nil)
  ending = ending.nil? ? @s.current : ending
  contents = String.new

  until @s.eos?
    char = @s.consume

    case char
    when ending
      break

    when "\n"
      # Parse error.
      @s.reconsume
      return create_token(:bad_string,
        :error => true,
        :value => contents)

    when '\\'
      upcoming = @s.peek

      # End of the input, so do nothing.
      next if upcoming == ''

      if upcoming == "\n"
        # An escaped newline continues the string and contributes nothing.
        @s.consume
      else
        contents << consume_escaped
      end

    else
      contents << char
    end
  end

  create_token(:string, :value => contents)
end
# Consumes a Unicode range token and returns it. Assumes the initial "u+" or
# "U+" has already been consumed.
#
# The token carries `:start` and `:end` code points. With `?` wildcards the
# bounds come from substituting `0` and `F`; otherwise `:end` is read from an
# explicit `-hex` suffix and falls back to `:start` when there is none.
#
# 4.3.7. http://dev.w3.org/csswg/css-syntax/#consume-a-unicode-range-token
def consume_unicode_range
  digits = @s.scan(RE_HEX) || String.new

  # Pad with "?" wildcards, up to six characters in total.
  digits << @s.consume while digits.length < 6 && @s.peek == '?'

  if digits.include?('?')
    first = digits.tr('?', '0').hex
    last  = digits.tr('?', 'F').hex
  else
    first = digits.hex

    if @s.peek(2) =~ RE_UNICODE_RANGE_END
      # Step over the "-".
      @s.consume
      last = (@s.scan(RE_HEX) || '').hex
    else
      last = first
    end
  end

  create_token(:unicode_range, :start => first, :end => last)
end
# Consumes a URL token and returns it. Assumes the original "url(" has
# already been consumed.
#
# Returns a `:url` token on success. Returns a `:bad_url` token, whose value
# also contains the discarded remainder of the URL, when the URL contains
# inner whitespace, a quote, `(`, a non-printable character or an invalid
# escape.
#
# 4.3.6. http://dev.w3.org/csswg/css-syntax/#consume-a-url-token
def consume_url
  url = String.new

  # Leading whitespace is not part of the URL.
  @s.scan(RE_WHITESPACE)

  until @s.eos?
    char = @s.consume

    case char
    when ')'
      break

    when RE_WHITESPACE
      @s.scan(RE_WHITESPACE)

      # Whitespace is only allowed right before the closing ")" or the end
      # of the input.
      unless @s.eos? || @s.peek == ')'
        return create_token(:bad_url, :value => url + consume_bad_url)
      end

      @s.consume
      break

    when '"', "'", '(', RE_NON_PRINTABLE
      # Parse error.
      return create_token(:bad_url,
        :error => true,
        :value => url + consume_bad_url)

    when '\\'
      unless valid_escape?
        # Parse error.
        return create_token(:bad_url,
          :error => true,
          :value => url + consume_bad_url)
      end

      url << consume_escaped

    else
      url << char
    end
  end

  create_token(:url, :value => url)
end
# Converts a valid CSS number string into a number and returns the number.
#
# _str_ must match RE_NUMBER_STR (as the representation built by
# #consume_number does). The result is an Integer or a Rational, or
# `Float::MAX` / `-Float::MAX` when the magnitude exceeds what a Float can
# hold.
#
# The exponent is bounded before it is used. It comes straight from the
# input, and an unbounded `10**e` would, depending on the Ruby version, build
# an enormous Integer, overflow to Infinity (turning a zero mantissa into
# NaN), or raise. The bound cannot change the outcome: past it the result is
# already beyond `Float::MAX` (and gets clamped) or already too small to
# convert to anything but zero.
#
# 4.3.14. http://dev.w3.org/csswg/css-syntax/#convert-a-string-to-a-number
def convert_string_to_number(str)
  matches = RE_NUMBER_STR.match(str)

  integer    = matches[:integer]
  fractional = matches[:fractional]

  s = matches[:sign] == '-' ? -1 : 1
  i = integer.to_i
  f = fractional.to_i
  d = fractional ? fractional.length : 0
  t = matches[:exponent_sign] == '-' ? -1 : 1
  e = matches[:exponent].to_i

  # A non-zero mantissa lies in [10**-d, 10**integer.length). Scaling it up by
  # more than d + 309 digits always exceeds Float::MAX, and scaling it down by
  # more than integer.length + 324 digits always underflows a Float, so this
  # limit covers both directions with room to spare.
  exponent_limit = integer.length + d + 400
  e = exponent_limit if e > exponent_limit

  # I know this formula looks nutty, but it's exactly what's defined in the
  # spec, and it works.
  value = s * (i + f * 10**-d) * 10**(t * e)

  # Maximum and minimum values aren't defined in the spec, but are enforced
  # here for sanity.
  if value > Float::MAX
    value = Float::MAX
  elsif value < -Float::MAX
    value = -Float::MAX
  end

  value
end
# Creates and returns a new token Hash of the given _type_. The token starts
# out with `:node` (the type), `:pos` (the scanner's marker) and `:raw` (the
# text marked by the scanner); _properties_ are merged on top and win over
# those defaults.
def create_token(type, properties = {})
  token = {
    :node => type,
    :pos  => @s.marker,
    :raw  => @s.marked
  }

  token.merge!(properties)
end
# Preprocesses _input_ to prepare it for the tokenizer and returns the result
# as a new UTF-8 String: _input_ is converted with `to_s`, bytes that are
# invalid or undefined in UTF-8 are replaced, CRLF / CR / FF are normalized to
# a single newline, and NUL characters become U+FFFD.
#
# 3.3. http://dev.w3.org/csswg/css-syntax/#input-preprocessing
def preprocess(input)
  utf8 = input.to_s.encode('UTF-8',
    :invalid => :replace,
    :undef   => :replace)

  normalized = utf8.gsub(/(?:\r\n|[\r\f])/, "\n")
  normalized.gsub("\u0000", "\ufffd")
end
# Returns `true` if the given three-character _text_ would start an
# identifier. If _text_ is `nil`, the current and next two characters in the
# input stream will be checked, but will not be consumed.
#
# 4.3.10. http://dev.w3.org/csswg/css-syntax/#would-start-an-identifier
def start_identifier?(text = nil)
  text = @s.current + @s.peek(2) if text.nil?

  case text[0]
  when '-'
    # After a hyphen: another hyphen, a name-start character, or an escape.
    second = text[1]

    return true if second == '-'
    return true if second =~ RE_NAME_START

    valid_escape?(text[1, 2]) ? true : false

  when RE_NAME_START
    true

  when '\\'
    valid_escape?(text[0, 2])

  else
    false
  end
end
# Returns `true` if the given three-character _text_ would start a number.
# If _text_ is `nil`, the current and next two characters in the input
# stream will be checked, but will not be consumed.
#
# 4.3.11. http://dev.w3.org/csswg/css-syntax/#starts-with-a-number
def start_number?(text = nil)
  text = @s.current + @s.peek(2) if text.nil?

  second = text[1]

  case text[0]
  when '+', '-'
    # A sign must be followed by a digit, or by a dot and then a digit.
    return true if second =~ RE_DIGIT

    (second == '.' && text[2] =~ RE_DIGIT) ? true : false

  when '.'
    second =~ RE_DIGIT ? true : false

  when RE_DIGIT
    true

  else
    false
  end
end
# Tokenizes the input stream and returns an array of tokens. The scanner is
# reset first; tokens are then collected from #consume until it returns a
# falsy value.
def tokenize
  @s.reset

  tokens = []
  token = consume

  while token
    tokens << token
    token = consume
  end

  tokens
end
# Returns `true` if the given two-character _text_ is the beginning of a
# valid escape sequence, i.e. a backslash that is not followed by a newline.
# If _text_ is `nil`, the current and next character in the input stream will
# be checked, but will not be consumed.
#
# 4.3.9. http://dev.w3.org/csswg/css-syntax/#starts-with-a-valid-escape
def valid_escape?(text = nil)
  text = @s.current + @s.peek if text.nil?

  return false unless text[0] == '\\'

  text[1] != "\n"
end
end
end