module Sterile

def codepoints_data


Lazy load codepoints data
# Returns the codepoint mapping table. The data file is required only on
# the first call; the result is cached in @codepoints_data afterwards.
def codepoints_data
  return @codepoints_data if @codepoints_data

  require "sterile/data/codepoints_data"
  @codepoints_data = Data.codepoints_data
end

def decode_entities(string)


The reverse of +encode_entities+. Turns HTML or numeric entities into
their Unicode counterparts.
# Turns HTML entities in +string+ into the characters they stand for and
# returns the result as a new string; +string+ itself is never modified,
# so frozen strings are accepted.
#
# Three forms are handled, in this order:
# * hexadecimal numeric entities (&#x41;)
# * decimal numeric entities (&#65;)
# * named entities, looked up in +html_entities_data+; names that are not
#   found there are left untouched.
def decode_entities(string)
  # Only real hex digits are accepted; anything else would be converted by
  # to_i(16) to 0 and silently turned into a NUL character.
  decoded = string.gsub(/&#x([0-9a-fA-F]{1,7});/) { [$1.to_i(16)].pack("U") }
  decoded = decoded.gsub(/&#(\d{1,7});/) { [$1.to_i].pack("U") }
  decoded.gsub(/&([a-zA-Z0-9]+);/) do
    codepoint = html_entities_data[$1]
    # $& is the whole matched entity: keep it verbatim when unknown.
    codepoint ? [codepoint].pack("U") : $&
  end
end

def encode_entities(string)


Turn Unicode characters into their HTML equivalents.
If a valid HTML entity is not possible, it will create a numeric entity.

%q{“Economy Hits Bottom,” ran the headline}.encode_entities # => &ldquo;Economy Hits Bottom,&rdquo; ran the headline
# Replaces characters with HTML entities by way of +transmogrify+.
# Codepoints 32..126 are emitted as the first element of their mapping;
# every other codepoint becomes "&name;" when the mapping carries an entity
# name in its third element, or a decimal numeric entity otherwise.
def encode_entities(string)
  transmogrify(string) do |mapping, codepoint|
    next mapping[0] if codepoint.between?(32, 126)

    entity = mapping[2] || "#" + codepoint.to_s
    "&" + entity + ";"
  end
end

def gsub_tags(string, &block)


Similar to +gsub+, except it works in between HTML/XML tags and
yields text to a block. Text will be replaced by what the block
returns.
Warning: does not work in some degenerate cases.
# Parses +string+ as an HTML fragment with Nokogiri, yields the content of
# every text node to the block and replaces that content with the block's
# return value. Returns the fragment serialized back to HTML.
# Raises a RuntimeError ("No block given") when called without a block.
def gsub_tags(string, &block)
  raise "No block given" if block.nil?

  doc = Nokogiri::HTML::DocumentFragment.parse(string)
  doc.traverse do |element|
    next unless element.text?

    element.content = yield(element.content)
  end
  doc.to_html
end

def html_entities_data


Lazy load html entities
# Returns the HTML entity lookup table. The data file is required on the
# first call only and the table is cached in @html_entities_data.
def html_entities_data
  unless @html_entities_data
    require "sterile/data/html_entities_data"
    @html_entities_data = Data.html_entities_data
  end
  @html_entities_data
end

def plain_format(string)

# Entity-encodes +string+ (via String#encode_entities) and then applies
# every rule from +plain_format_rules+ to the encoded copy, in order.
# Each rule is indexed as [pattern, replacement]. Returns the resulting
# string.
def plain_format(string)
  plain_format_rules.each_with_object(string.encode_entities) do |rule, text|
    text.gsub!(rule[0], rule[1])
  end
end

def plain_format_rules


Lazy load plain formatting rules
# Returns the rule list used by +plain_format+. The data file is required
# on first use and the list is cached in @plain_format_rules.
def plain_format_rules
  cached = @plain_format_rules
  return cached if cached

  require "sterile/data/plain_format_rules"
  @plain_format_rules = Data.plain_format_rules
end

def plain_format_tags(string)


Like +plain_format+, but works with HTML/XML (somewhat).
# Applies +plain_format+ to the text between tags only: each text segment
# yielded by String#gsub_tags is plain-formatted and entity-decoded, and
# the reassembled markup is entity-encoded as a whole at the end.
def plain_format_tags(string)
  formatted = string.gsub_tags { |text| text.plain_format.decode_entities }
  formatted.encode_entities
end

def scan_tags(string, &block)


Iterates over all text in between HTML/XML tags and yields
it to a block.
Warning: does not work in some degenerate cases.
# Parses +string+ as an HTML fragment with Nokogiri and yields the content
# of every text node to the block. The markup is not modified and the
# method always returns nil.
# Raises a RuntimeError ("No block given") when called without a block.
def scan_tags(string, &block)
  raise "No block given" if block.nil?

  Nokogiri::HTML::DocumentFragment.parse(string).traverse do |element|
    next unless element.text?

    yield(element.content)
  end
  nil
end

def sluggerize(string, options = {})


Transliterate to ASCII, downcase and format for URL permalink/slug
by stripping out all non-alphanumeric characters and replacing spaces
with a delimiter (defaults to '-').

"Hello World!".sluggerize # => "hello-world"
# Builds a lower-case slug from +string+.
#
# The text is passed through +sterilize+ and stripped; whitespace runs are
# turned into hyphens, every character other than ASCII letters, digits and
# hyphens is dropped, and each run of hyphens is replaced by the delimiter.
#
# Options:
#   :delimiter - string placed between words (default "-")
def sluggerize(string, options = {})
  settings = { :delimiter => "-" }.merge(options)

  slug = sterilize(string).strip
  slug = slug.gsub(/\s+/, "-")
  slug = slug.gsub(/[^a-zA-Z0-9\-]/, "")
  slug = slug.gsub(/-+/, settings[:delimiter])
  slug.downcase
end

def smart_format(string)


Format text with proper "curly" quotes, m-dashes, copyright, trademark, etc.

%q{"He said, 'Away with you, Drake!'"}.smart_format # => “He said, ‘Away with you, Drake!’”
# Applies every rule from +smart_format_rules+ to the text, in order, and
# returns the formatted result. Each rule is indexed as
# [pattern, replacement].
#
# +string+ is converted with +to_s+ and always copied before the rules are
# applied, so the caller's object is never modified in place (the rules use
# gsub!, which would otherwise rewrite any unfrozen argument).
def smart_format(string)
  formatted = string.to_s.dup
  smart_format_rules.each do |rule|
    formatted.gsub!(rule[0], rule[1])
  end
  formatted
end

def smart_format_rules


Lazy load smart formatting rules
# Returns the rule list used by +smart_format+. The data file is required
# on first use and the list is cached in @smart_format_rules.
def smart_format_rules
  if @smart_format_rules
    @smart_format_rules
  else
    require "sterile/data/smart_format_rules"
    @smart_format_rules = Data.smart_format_rules
  end
end

def smart_format_tags(string)


Like +smart_format+, but works with HTML/XML (somewhat).
# Applies +smart_format+ to the text between tags (via String#gsub_tags),
# entity-encodes the result and patches up quotes around closing tags.
def smart_format_tags(string)
  # Fixes quote after whitespace + tag "<em>Dan. </em>'And": the whitespace
  # in front of the closing tag is moved behind it.
  respaced = string.gsub(/[\p{Z}\s]+(<\/[a-zA-Z]+>)(['"][a-zA-Z])/, "\\1 \\2")
  formatted = respaced.gsub_tags { |text| text.smart_format }
  encoded = formatted.encode_entities
  # An opening quote entity directly after a closing tag is rewritten to
  # the matching closing quote entity.
  encoded = encoded.gsub(/(\<\/\w+\>)&ldquo;/, "\\1&rdquo;")
  encoded.gsub(/(\<\/\w+\>)&lsquo;/, "\\1&rsquo;")
end

def sterilize(string)


Transliterate to ASCII and strip out any HTML/XML tags.

"nåsty".sterilize # => "nasty"
# Runs +string+ through +transliterate+ and then removes markup from the
# result with +strip_tags+ (both with their default options).
def sterilize(string)
  ascii = transliterate(string)
  strip_tags(ascii)
end

def strip_tags(string, options = {})


Remove HTML/XML tags from text. Also strips out comments, PHP and ERB style tags.
CDATA is considered text unless :keep_cdata => false is specified.
Redundant whitespace will be removed unless :keep_whitespace => true is specified.
# Removes markup from +string+ and returns the remaining text as a new
# string; the argument itself is never modified, so frozen strings are
# accepted.
#
# Removed, in this order: PHP/ERB style tags (<? ... > and <% ... >), HTML
# comments, CDATA wrappers and finally ordinary opening/closing tags.
#
# Options:
#   :keep_whitespace - when false (default) the result is passed through
#                      +trim_whitespace+; when true it is returned as is.
#   :keep_cdata      - when true (default) the content of CDATA sections is
#                      kept as text; when false it is dropped.
def strip_tags(string, options = {})
  options = {
    :keep_whitespace => false,
    :keep_cdata      => true
  }.merge!(options)

  # All substitutions below are in-place; operate on a private copy.
  string = string.dup

  string.gsub!(/<[%?](php)?[^>]*>/, '') # strip php, erb et al

  # Non-greedy, multi-line match so that comments containing hyphens or
  # line breaks are removed as well.
  string.gsub!(/<!--.*?-->/m, '')       # strip comments

  # Non-greedy, multi-line match so that CDATA content may itself contain
  # "]" characters or line breaks.
  string.gsub!(
    /
      <!\[CDATA\[
      (.*?)
      \]\]>
    /xmi,
    options[:keep_cdata] ? '\\1' : ''
  )

  # Building blocks of a tag: name, attribute value (bare word or quoted
  # string) and attribute (name with an optional value).
  html_name = /[\w:-]+/
  html_data = /([A-Za-z0-9]+|('[^']*?'|"[^"]*?"))/
  html_attr = /(#{html_name}(\s*=\s*#{html_data})?)/
  string.gsub!(
    /
      <
      [\/]?
      #{html_name}
      (\s+(#{html_attr}(\s+#{html_attr})*))?
      \s*
      [\/]?
      >
    /xi,
    ''
  )

  options[:keep_whitespace] ? string : trim_whitespace(string)
end

def titlecase(string)


Format text appropriately for titles. This method is much smarter
than ActiveSupport's +titlecase+. The algorithm is based on work done
by John Gruber et al (http://daringfireball.net/2008/08/title_case_update)
# Returns a title-cased copy of +string+; the argument itself is never
# modified, so frozen strings are accepted.
#
# Steps, in order:
# 1. Surrounding whitespace is stripped and inner whitespace runs collapse
#    to a single space. Text without any lower-case letter is downcased
#    first.
# 2. Each word is classified: URL/domain/email-like tokens are kept as is,
#    "small words" (a, an, and, ...) are downcased, words without internal
#    capitals are capitalized, and other words are left alone.
# 3. Small words are capitalized at the start of the title, after
#    sentence-like punctuation, after an opening quote/bracket and at the
#    end of the title or of a quoted/bracketed phrase.
# 4. A letter directly following "X-" (single letter plus hyphen or en
#    dash) is downcased, and "q&a" is normalized to "Q&A".
def titlecase(string)
  lsquo = [8216].pack("U")
  rsquo = [8217].pack("U")
  ldquo = [8220].pack("U")
  rdquo = [8221].pack("U")
  ndash = [8211].pack("U")

  # Every transformation below is destructive; work on a private copy so
  # the caller's string is left untouched.
  string = string.dup
  string.strip!
  string.gsub!(/\s+/, " ")
  string.downcase! unless string =~ /[[:lower:]]/

  small_words = %w{ a an and as at(?!&t) but by en for if in nor of on or the to v[.]? via vs[.]? }.join("|")
  # Optional apostrophe plus lower-case tail, e.g. the "'s" in "John's".
  apos = / (?: ['#{rsquo}] [[:lower:]]* )? /xu

  string.gsub!(
    /
      \b
      ([_\*]*)
      (?:
        ( [-\+\w]+ [@.\:\/] [-\w@.\:\/]+ #{apos} )      # URL, domain, or email
        |
        ( (?i: #{small_words} ) #{apos} )               # or small word, case-insensitive
        |
        ( [[:alpha:]] [[:lower:]'#{rsquo}()\[\]{}]* #{apos} )  # or word without internal caps
        |
        ( [[:alpha:]] [[:alpha:]'#{rsquo}()\[\]{}]* #{apos} )  # or some other word
      )
      ([_\*]*)
      \b
    /xu
  ) do
    # $1 and $6 are leading and trailing emphasis markers (_ and *); exactly
    # one of $2..$5 is set, depending on which alternative matched.
    ($1 ? $1 : "") +
    ($2 ? $2 : ($3 ? $3.downcase : ($4 ? $4.downcase.capitalize : $5))) +
    ($6 ? $6 : "")
  end

  # Legacy interpreters only: capitalize both sides of an en-dashed pair.
  if RUBY_VERSION < "1.9.0"
    string.gsub!(
      /
        \b
        ([[:alpha:]]+)
        (#{ndash})
        ([[:alpha:]]+)
        \b
      /xu
    ) do
      $1.downcase.capitalize + $2 + $3.downcase.capitalize
    end
  end

  string.gsub!(
    /
      (
        \A [[:punct:]]*     # start of title
        | [:.;?!][ ]+       # or of subsentence
        | [ ]['"#{ldquo}#{lsquo}(\[][ ]*  # or of inserted subphrase
      )
      ( #{small_words} )    # followed by a small-word
      \b
    /xiu
  ) do
    $1 + $2.downcase.capitalize
  end

  string.gsub!(
    /
      \b
      ( #{small_words} )    # small-word
      (?=
        [[:punct:]]* \Z     # at the end of the title
        |
        ['"#{rsquo}#{rdquo})\]] [ ]       # or of an inserted subphrase
      )
    /xu
  ) do
    $1.downcase.capitalize
  end

  string.gsub!(
    /
      (
        \b
        [[:alpha:]]         # single first letter
        [\-#{ndash}]               # followed by a dash
      )
      ( [[:alpha:]] )       # followed by a letter
    /xu
  ) do
    $1 + $2.downcase
  end

  string.gsub!(/q&a/i, 'Q&A')
  string
end

def transliterate(string, options = {})


Transliterate Unicode [and accented ASCII] characters to their plain-text
ASCII equivalents. This is based on data from the stringex gem (https://github.com/rsl/stringex)
which is in turn a port of Perl's Unidecode and ostensibly provides
superior results to iconv. The optical conversion data is based on work
by Eric Boehs at https://github.com/ericboehs/to_slug
Passing an option of :optical => true will prefer optical mapping instead
of more pedantic matches.

"ýůçký".transliterate # => "yucky"
# Replaces every character of +string+ with an entry from its codepoint
# mapping (see +transmogrify+) and returns the result.
#
# Options:
#   :optical - when false (default) the first mapping element is preferred
#              and the second is the fallback; when true the preference is
#              reversed. Characters with neither element map to "".
def transliterate(string, options = {})
  settings = { :optical => false }.merge(options)
  preferred, fallback = settings[:optical] ? [1, 0] : [0, 1]

  transmogrify(string) do |mapping, _codepoint|
    mapping[preferred] || mapping[fallback] || ""
  end
end

def transmogrify(string, &block)

# Walks the Unicode codepoints of +string+, looks each one up in
# +codepoints_data+ (indexed by the high bits, then the low byte) and
# yields the mapping (always wrapped in an Array) together with the
# codepoint. The block's return values are appended to a fresh string,
# which is returned.
#
# Best effort by design: any StandardError raised while looking up or
# converting a single codepoint is swallowed and that codepoint simply
# contributes nothing to the result.
# Raises a RuntimeError ("No block given") when called without a block.
def transmogrify(string, &block)
  raise "No block given" if block.nil?

  string.unpack("U*").each_with_object("") do |codepoint, output|
    page, offset = codepoint >> 8, codepoint & 0xFF
    begin
      entry = Array(codepoints_data[page][offset])
      output << yield(entry, codepoint)
    rescue
      # deliberately ignored: unmapped or unconvertible codepoints are skipped
    end
  end
end

def trim_whitespace(string)


Trim whitespace from start and end of string and remove any redundant
whitespace in between.

" Hello world! ".trim_whitespace # => "Hello world!"
# Collapses every run of whitespace in +string+ to a single space and then
# strips the ends. Returns a new string.
def trim_whitespace(string)
  collapsed = string.gsub(/\s+/, " ")
  collapsed.strip
end