module Sterile
def codepoints_data
Lazy load codepoints data
# Returns the codepoint mapping table. The data file is required on first
# use and the table is cached in @codepoints_data for later calls.
def codepoints_data
  return @codepoints_data if @codepoints_data

  require "sterile/data/codepoints_data"
  @codepoints_data = Data.codepoints_data
end
def decode_entities(string)
their Unicode counterparts.
The reverse of +encode_entities+. Turns HTML or numeric entities into
# The reverse of +encode_entities+. Turns hexadecimal (&#x41;), decimal
# (&#65;) and named (&amp;) entities into their Unicode counterparts.
#
# string - the text to decode; it is never modified, so frozen strings are
#          accepted.
#
# Returns a new String. Named entities that are missing from
# html_entities_data, and malformed numeric entities, are left untouched.
def decode_entities(string)
  # One pass over the input: text produced by decoding one entity is never
  # re-scanned, so "&#x26;#65;" becomes "&#65;" rather than "A".
  string.gsub(/&(?:#x([0-9a-fA-F]{1,7})|#(\d{1,7})|([a-zA-Z0-9]+));/) do
    match = Regexp.last_match
    codepoint =
      if match[1]
        match[1].to_i(16)
      elsif match[2]
        match[2].to_i
      else
        html_entities_data[match[3]]
      end
    codepoint ? [codepoint].pack("U") : match[0]
  end
end
def encode_entities(string)
q{“Economy Hits Bottom,” ran the headline}.encode_entities # => &ldquo;Economy Hits Bottom,&rdquo; ran the headline
If a valid HTML entity is not possible, it will create a numeric entity.
Turn Unicode characters into their HTML equivalents.
# Turn Unicode characters into HTML entities.
#
# Codepoints 32..126 are emitted as the first entry of their mapping.
# Every other codepoint becomes "&name;" when its mapping has a third entry,
# or a decimal numeric entity ("&#8220;") when it does not.
def encode_entities(string)
  transmogrify(string) do |mapping, codepoint|
    next mapping[0] if codepoint >= 32 && codepoint <= 126

    entity = mapping[2] || "#" + codepoint.to_s
    "&" + entity + ";"
  end
end
def gsub_tags(string, &block)
Warning: does not work in some degenerate cases.
returns.
yields text to a block. Text will be replaced by what the block
Similar to +gsub+, except it works in between HTML/XML tags and
# Similar to +gsub+, but operates on the text between HTML/XML tags.
# The string is parsed as a Nokogiri HTML fragment; the content of every
# text node is passed to the block and replaced by the block's result.
#
# Returns the fragment serialized back to HTML.
# Raises RuntimeError ("No block given") when called without a block.
# Warning: does not work in some degenerate cases.
def gsub_tags(string, &block)
  raise "No block given" if block.nil?

  fragment = Nokogiri::HTML::DocumentFragment.parse(string)
  fragment.traverse do |node|
    next unless node.text?

    node.content = block.call(node.content)
  end
  fragment.to_html
end
def html_entities_data
Lazy load html entities
# Returns the HTML entity table. The data file is required on first use and
# the table is cached in @html_entities_data for later calls.
def html_entities_data
  return @html_entities_data if @html_entities_data

  require "sterile/data/html_entities_data"
  @html_entities_data = Data.html_entities_data
end
def plain_format(string)
# Entity-encodes the string (via String#encode_entities) and then applies
# every plain-format rule in order. Each rule is indexed as
# [pattern, replacement] and applied with gsub! to the encoded copy.
#
# Returns the encoded, rewritten string.
def plain_format(string)
  encoded = string.encode_entities
  plain_format_rules.each { |rule| encoded.gsub!(rule[0], rule[1]) }
  encoded
end
def plain_format_rules
Lazy load plain formatting rules
# Returns the plain formatting rules. The data file is required on first use
# and the rules are cached in @plain_format_rules for later calls.
def plain_format_rules
  return @plain_format_rules if @plain_format_rules

  require "sterile/data/plain_format_rules"
  @plain_format_rules = Data.plain_format_rules
end
def plain_format_tags(string)
Like +plain_format+, but works with HTML/XML (somewhat).
# Like +plain_format+, but works with HTML/XML (somewhat): each run of text
# between tags is plain-formatted and entity-decoded, then the reassembled
# markup is entity-encoded as a whole.
def plain_format_tags(string)
  rewritten = string.gsub_tags { |text| text.plain_format.decode_entities }
  rewritten.encode_entities
end
def scan_tags(string, &block)
Warning: does not work in some degenerate cases.
it to a block.
Iterates over all text in between HTML/XML tags and yields
# Iterates over all text in between HTML/XML tags. The string is parsed as
# a Nokogiri HTML fragment and the content of every text node is passed to
# the block.
#
# Always returns nil.
# Raises RuntimeError ("No block given") when called without a block.
# Warning: does not work in some degenerate cases.
def scan_tags(string, &block)
  raise "No block given" if block.nil?

  Nokogiri::HTML::DocumentFragment.parse(string).traverse do |node|
    block.call(node.content) if node.text?
  end
  nil
end
def sluggerize(string, options = {})
"Hello World!".sluggerize # => "hello-world"
with a delimiter (defaults to '-').
by stripping out all non-alphanumeric characters and replacing spaces
Transliterate to ASCII, downcase and format for URL permalink/slug
# Builds a URL permalink/slug: sterilizes the string, strips it, turns each
# run of whitespace into a hyphen, removes everything that is not an ASCII
# letter, digit or hyphen, replaces each run of hyphens with the delimiter
# and downcases the result.
#
# options - :delimiter => String placed between words (defaults to "-").
#
#   "Hello World!".sluggerize # => "hello-world"
def sluggerize(string, options = {})
  delimiter = { :delimiter => "-" }.merge(options)[:delimiter]

  slug = sterilize(string).strip
  slug = slug.gsub(/\s+/, "-")
  slug = slug.gsub(/[^a-zA-Z0-9\-]/, "")
  slug = slug.gsub(/-+/, delimiter)
  slug.downcase
end
def smart_format(string)
q{"He said, 'Away with you, Drake!'"}.smart_format # => “He said, ‘Away with you, Drake!’”
Format text with proper "curly" quotes, m-dashes, copyright, trademark, etc.
# Format text with proper "curly" quotes, m-dashes, copyright, trademark,
# etc. by applying every smart-format rule in order. Each rule is indexed
# as [pattern, replacement].
#
# string - anything responding to to_s. The rules always run on a private
#          copy, so the caller's string is never modified, whether or not
#          it is frozen.
#
# Returns a new String with all rules applied.
def smart_format(string)
  formatted = string.to_s.dup
  smart_format_rules.each { |rule| formatted.gsub!(rule[0], rule[1]) }
  formatted
end
def smart_format_rules
Lazy load smart formatting rules
# Returns the smart formatting rules. The data file is required on first use
# and the rules are cached in @smart_format_rules for later calls.
def smart_format_rules
  return @smart_format_rules if @smart_format_rules

  require "sterile/data/smart_format_rules"
  @smart_format_rules = Data.smart_format_rules
end
def smart_format_tags(string)
Like +smart_format+, but works with HTML/XML (somewhat).
# Like +smart_format+, but works with HTML/XML (somewhat): each run of text
# between tags is smart-formatted, the result is entity-encoded, and an
# opening curly quote that directly follows a closing tag is swapped for
# the matching closing quote.
def smart_format_tags(string)
  # Fixes quote after whitespace + tag "<em>Dan. </em>'And"
  prepared = string.gsub(/[\p{Z}\s]+(<\/[a-zA-Z]+>)(['"][a-zA-Z])/, "\\1 \\2")

  formatted = prepared.gsub_tags { |text| text.smart_format }
  encoded = formatted.encode_entities

  encoded.gsub(/(\<\/\w+\>)“/, "\\1”").gsub(/(\<\/\w+\>)‘/, "\\1’")
end
def sterilize(string)
"nåsty".sterilize # => "nasty"
Transliterate to ASCII and strip out any HTML/XML tags.
# Transliterate to ASCII, then strip out any HTML/XML tags.
#
#   "nåsty".sterilize # => "nasty"
def sterilize(string)
  ascii = transliterate(string)
  strip_tags(ascii)
end
def strip_tags(string, options = {})
Redundant whitespace will be removed unless :keep_whitespace => true is specified.
CDATA is considered text unless :keep_cdata => false is specified.
Remove HTML/XML tags from text. Also strips out comments, PHP and ERB style tags.
# Remove HTML/XML tags from text. Also strips out comments, PHP and ERB
# style tags.
#
# string  - the markup to clean; it is never modified, so frozen strings
#           are accepted.
# options - :keep_whitespace => true returns the text without collapsing
#                               whitespace (default false: the result goes
#                               through trim_whitespace).
#           :keep_cdata => false drops CDATA sections entirely (default
#                          true: the CDATA payload is kept as text).
#
# Returns a new String.
def strip_tags(string, options = {})
  options = { :keep_whitespace => false, :keep_cdata => true }.merge(options)

  html_name = /[\w:-]+/
  html_data = /([A-Za-z0-9]+|('[^']*?'|"[^"]*?"))/
  html_attr = /(#{html_name}(\s*=\s*#{html_data})?)/

  # Non-destructive gsub throughout, so the caller's string stays intact.
  stripped = string.gsub(/<[%?](php)?[^>]*>/, '') # strip php, erb et al

  # Lazy multi-line match so comments containing "-" are removed as well.
  stripped = stripped.gsub(/<!--.*?-->/m, '')

  # Lazy multi-line match so a payload containing "]" is handled.
  stripped = stripped.gsub(
    /
      <!\[CDATA\[
      (.*?)
      \]\]>
    /xmi,
    options[:keep_cdata] ? '\\1' : ''
  )

  stripped = stripped.gsub(
    /
      <
      [\/]?
      #{html_name}
      (\s+(#{html_attr}(\s+#{html_attr})*))?
      \s*
      [\/]?
      >
    /xi,
    ''
  )

  options[:keep_whitespace] ? stripped : trim_whitespace(stripped)
end
def titlecase(string)
by John Gruber et al (http://daringfireball.net/2008/08/title_case_update)
than ActiveSupport's +titlecase+. The algorithm is based on work done
Format text appropriately for titles. This method is much smarter
# Format text appropriately for titles, based on the title-case algorithm
# by John Gruber et al (http://daringfireball.net/2008/08/title_case_update).
#
# Steps, in order:
#   1. strip the text and collapse whitespace runs to single spaces;
#   2. downcase everything when the text contains no lowercase letter;
#   3. capitalize ordinary words, downcase "small words" (a, an, of, ...),
#      and leave URLs/emails and words with internal capitals alone;
#   4. capitalize a small word that starts the title, a subsentence or an
#      inserted subphrase;
#   5. capitalize a small word that ends the title or an inserted subphrase;
#   6. downcase the letter after a single-letter, dash-joined prefix;
#   7. normalize "q&a" to "Q&A".
#
# string - the title text; it is never modified, so frozen strings are
#          accepted.
#
# Returns a new String.
def titlecase(string)
  lsquo = [8216].pack("U")
  rsquo = [8217].pack("U")
  ldquo = [8220].pack("U")
  rdquo = [8221].pack("U")
  ndash = [8211].pack("U")

  # String#strip returns a fresh string; every in-place edit below works on
  # that copy instead of the caller's object.
  string = string.strip
  string.gsub!(/\s+/, " ")
  string.downcase! unless string =~ /[[:lower:]]/

  small_words = %w{ a an and as at(?!&t) but by en for if in nor of on or the to v[.]? via vs[.]? }.join("|")
  apos = / (?: ['#{rsquo}] [[:lower:]]* )? /xu

  string.gsub!(
    /
      \b
      ([_\*]*)
      (?:
        ( [-\+\w]+ [@.\:\/] [-\w@.\:\/]+ #{apos} )              # URL, domain, or email
        |
        ( (?i: #{small_words} ) #{apos} )                       # or small word, case-insensitive
        |
        ( [[:alpha:]] [[:lower:]'#{rsquo}()\[\]{}]* #{apos} )   # or word without internal caps
        |
        ( [[:alpha:]] [[:alpha:]'#{rsquo}()\[\]{}]* #{apos} )   # or some other word
      )
      ([_\*]*)
      \b
    /xu
  ) do
    ($1 ? $1 : "") +
      ($2 ? $2 : ($3 ? $3.downcase : ($4 ? $4.downcase.capitalize : $5))) +
      ($6 ? $6 : "")
  end

  # Only reached on Ruby older than 1.9: capitalize both sides of an
  # en-dash compound. The right-hand word is the third capture, and the
  # POSIX class needs its outer brackets.
  if RUBY_VERSION < "1.9.0"
    string.gsub!(
      /
        \b
        ([[:alpha:]]+)
        (#{ndash})
        ([[:alpha:]]+)
        \b
      /xu
    ) do
      $1.downcase.capitalize + $2 + $3.downcase.capitalize
    end
  end

  string.gsub!(
    /
      (
        \A [[:punct:]]*                       # start of title
        | [:.;?!][ ]+                         # or of subsentence
        | [ ]['"#{ldquo}#{lsquo}(\[][ ]*      # or of inserted subphrase
      )
      ( #{small_words} )                      # followed by a small-word
      \b
    /xiu
  ) do
    $1 + $2.downcase.capitalize
  end

  string.gsub!(
    /
      \b
      ( #{small_words} )                      # small-word
      (?=
        [[:punct:]]* \Z                       # at the end of the title
        |
        ['"#{rsquo}#{rdquo})\]] [ ]           # or of an inserted subphrase
      )
    /xu
  ) do
    $1.downcase.capitalize
  end

  string.gsub!(
    /
      (
        \b
        [[:alpha:]]                           # single first letter
        [\-#{ndash}]                          # followed by a dash
      )
      ( [[:alpha:]] )                         # followed by a letter
    /xu
  ) do
    $1 + $2.downcase
  end

  string.gsub!(/q&a/i, 'Q&A')

  string
end
def transliterate(string, options = {})
"ýůçký".transliterate # => "yucky"
of more pedantic matches.
Passing an option of :optical => true will prefer optical mapping instead
by Eric Boehs at https://github.com/ericboehs/to_slug
superior results to iconv. The optical conversion data is based on work
which is in turn a port of Perl's Unidecode and ostensibly provides
ASCII equivalents. This is based on data from the stringex gem (https://github.com/rsl/stringex)
Transliterate Unicode [and accented ASCII] characters to their plain-text
# Transliterate Unicode [and accented ASCII] characters to their plain-text
# ASCII equivalents using the codepoint mapping table.
#
# options - :optical => true prefers the second mapping entry (optical
#           match) and falls back to the first; the default prefers the
#           first entry and falls back to the second. A character with
#           neither entry becomes "".
#
#   "ýůçký".transliterate # => "yucky"
def transliterate(string, options = {})
  optical = { :optical => false }.merge(options)[:optical]
  preferred, fallback = optical ? [1, 0] : [0, 1]

  transmogrify(string) do |mapping, _codepoint|
    mapping[preferred] || mapping[fallback] || ""
  end
end
def transmogrify(string, &block)
# Walks the string one Unicode codepoint at a time. For each codepoint the
# mapping is looked up in codepoints_data (indexed first by the codepoint's
# high bits, then by its low byte) and wrapped with Array(); the block
# receives (mapping, codepoint) and its return value is appended to the
# result.
#
# Returns the accumulated String.
# Raises RuntimeError ("No block given") when called without a block.
def transmogrify(string, &block)
  raise "No block given" if block.nil?

  string.unpack("U*").each_with_object("") do |codepoint, output|
    page, offset = codepoint.divmod(256)
    begin
      output << block.call(Array(codepoints_data[page][offset]), codepoint)
    rescue
      # Best effort: if the lookup or the block raises a StandardError,
      # nothing is appended for this codepoint.
    end
  end
end
def trim_whitespace(string)
" Hello world! ".trim_whitespace # => "Hello world!"
whitespace in between.
Trim whitespace from start and end of string and remove any redundant
# Collapse every run of whitespace into a single space, then trim the
# start and end of the result.
#
#   " Hello   world! ".trim_whitespace # => "Hello world!"
def trim_whitespace(string)
  collapsed = string.gsub(/\s+/, " ")
  collapsed.strip
end