#!/usr/bin/env ruby

# The XChar library is provided courtesy of Sam Ruby (See
# http://intertwingly.net/stories/2005/09/28/xchar.rb)

# --------------------------------------------------------------------

######################################################################
module Hpricot
  ####################################################################
  # XML Character converter, from Sam Ruby:
  # (see http://intertwingly.net/stories/2005/09/28/xchar.rb).
  #
  module XChar # :nodoc:
    # Replacement Unicode code points for the Windows-1252 values in the
    # 128..159 range. See
    # http://intertwingly.net/stories/2004/04/14/i18n.html#CleaningWindows
    # for details.
    CP1252 = { # :nodoc:
      128 => 8364, # euro sign
      130 => 8218, # single low-9 quotation mark
      131 => 402,  # latin small letter f with hook
      132 => 8222, # double low-9 quotation mark
      133 => 8230, # horizontal ellipsis
      134 => 8224, # dagger
      135 => 8225, # double dagger
      136 => 710,  # modifier letter circumflex accent
      137 => 8240, # per mille sign
      138 => 352,  # latin capital letter s with caron
      139 => 8249, # single left-pointing angle quotation mark
      140 => 338,  # latin capital ligature oe
      142 => 381,  # latin capital letter z with caron
      145 => 8216, # left single quotation mark
      146 => 8217, # right single quotation mark
      147 => 8220, # left double quotation mark
      148 => 8221, # right double quotation mark
      149 => 8226, # bullet
      150 => 8211, # en dash
      151 => 8212, # em dash
      152 => 732,  # small tilde
      153 => 8482, # trade mark sign
      154 => 353,  # latin small letter s with caron
      155 => 8250, # single right-pointing angle quotation mark
      156 => 339,  # latin small ligature oe
      158 => 382,  # latin small letter z with caron
      159 => 376   # latin capital letter y with diaeresis
    }.freeze

    # Code point => entity reference for the characters that must be
    # escaped in XML character data.
    # See http://www.w3.org/TR/REC-xml/#dt-chardata for details.
    PREDEFINED = {
      34 => '&quot;', # quotation mark
      38 => '&amp;',  # ampersand
      60 => '&lt;',   # left angle bracket
      62 => '&gt;'    # right angle bracket
    }.freeze

    # Reverse lookup of PREDEFINED: entity reference => code point.
    PREDEFINED_U = PREDEFINED.invert.freeze

    # Code points (and code point ranges) that xchr passes through.
    # See http://www.w3.org/TR/REC-xml/#charsets for details.
    VALID = [
      0x9, 0xA, 0xD,
      (0x20..0xD7FF),
      (0xE000..0xFFFD),
      (0x10000..0x10FFFF)
    ].freeze
  end

  class << self
    # XML escaped version of chr.
    #
    # +str+ is a single code point (an Integer, despite the name). It is
    # first remapped through XChar::CP1252, then:
    # * code points outside XChar::VALID become '*';
    # * the four XChar::PREDEFINED characters become entity references;
    # * other code points below 128 are returned as one-character strings;
    # * everything else becomes a decimal character reference ("&#NNN;").
    def xchr(str)
      n = XChar::CP1252[str] || str
      case n
      when *XChar::VALID
        XChar::PREDEFINED[n] || (n < 128 ? n.chr : "&##{n};")
      else
        '*'
      end
    end

    # XML escaped version of to_s.
    #
    # The string is decoded as UTF-8 first; if that raises (unpack('U*')
    # rejects malformed UTF-8), it is re-read byte by byte instead.
    def xs(str)
      str.to_s.unpack('U*').map { |n| xchr(n) }.join # ASCII, UTF-8
    rescue
      str.to_s.unpack('C*').map { |n| xchr(n) }.join # ISO-8859-1, WIN-1252
    end

    # XML unescape.
    #
    # Replaces named entities, decimal references ("&#65;") and hex
    # references ("&#x41;") in +str+. Named entities that are not in
    # XChar::PREDEFINED_U are replaced with '?'.
    #
    # All three forms are handled in a single pass so that the output of
    # one replacement is never re-scanned: "&amp;#65;" yields "&#65;",
    # not "A".
    def uxs(str)
      str.to_s.gsub(/&(?:#(\d+)|#x([0-9a-fA-F]+)|\w+);/) do |entity|
        decimal = Regexp.last_match(1)
        hex = Regexp.last_match(2)
        if decimal
          [decimal.to_i].pack('U*')
        elsif hex
          [hex.to_i(16)].pack('U*')
        else
          (XChar::PREDEFINED_U[entity] || 63).chr # 63 = '?' (query char)
        end
      end
    end
  end
end