module IsoDoc
  # Localisation of spacing and punctuation (CJK, French) and
  # bidirectional-text wrapping.
  class I18n
    # Class-level counterpart of #l10n.
    #
    # NOTE(review): inside a class method the bare call below resolves to
    # this same class method, so as written it recurses without bound.
    # Delegating to an instance needs the constructor, which is not visible
    # in this part of the file; left unchanged until that is confirmed.
    def self.l10n(text, lang = @lang, script = @script, locale = @locale)
      l10n(text, lang, script, locale)
    end

    # Localise spaces and punctuation in +text+.
    #
    # @param text [String] text (may contain XML markup) to localise
    # @param lang [String, nil] language code; "zh", "ja", "ko" get CJK
    #   treatment, "fr" gets French treatment
    # @param script [String, nil] script code, passed on to the CJK routine
    #   and to #bidiwrap
    # @param locale [String, nil] locale used for French; "FR" when nil
    # @return [String] localised text, wrapped in bidi marks where needed
    def l10n(text, lang = @lang, script = @script, locale = @locale)
      text = l10n_zh(text, script) if %w(zh ja ko).include?(lang)
      text = l10n_fr(text, locale || "FR") if lang == "fr"
      bidiwrap(text, lang, script)
    end

    # Wrap +text+ in directional marks when its direction differs from the
    # direction of the instance's own language/script.
    #
    # @return [String] +text+, possibly surrounded by U+061C (ARABIC LETTER
    #   MARK, for Arab/Aran scripts), U+200F (RIGHT-TO-LEFT MARK) or
    #   U+200E (LEFT-TO-RIGHT MARK)
    def bidiwrap(text, lang, script)
      my_script, my_rtl, outer_rtl = bidiwrap_vars(lang, script)
      if my_rtl && !outer_rtl
        # Marks are written as escapes because they are invisible as literals.
        mark = %w(Arab Aran).include?(my_script) ? "\u061c" : "\u200f"
        "#{mark}#{text}#{mark}"
      elsif !my_rtl && outer_rtl then "\u200e#{text}\u200e"
      else text
      end
    end

    # @return [Array] the script of the text, whether that script is
    #   right-to-left, and whether the instance's script is right-to-left
    def bidiwrap_vars(lang, script)
      my_script = script || Metanorma::Utils.default_script(lang)
      [my_script,
       Metanorma::Utils.rtl_script?(my_script),
       Metanorma::Utils.rtl_script?(@script || Metanorma::Utils
         .default_script(@lang))]
    end

    # CJK localisation: parse +text+ as an XML fragment, localise each text
    # node using all other text nodes as context, and serialise the result
    # with <b>, </b> and <?...?> sequences stripped.
    def l10n_zh(text, script = "Hans")
      xml = Nokogiri::XML::DocumentFragment.parse(text)
      nodes = xml.xpath(".//text()")
      nodes.each_with_index do |node, i|
        prev, foll = l10n_context(nodes, i)
        str = cleanup_entities(node.text, is_xml: false)
        node.replace(l10_zh1(str, prev, foll, script))
      end
      xml.to_xml(encoding: "UTF-8").gsub(/<b>/, "").gsub("</b>", "")
        .gsub(/<\?[^>]+>/, "")
    end

    # Previous and following context of the text node at +idx+.
    # Context is not limited to the immediately adjoining text nodes: spaces
    # and empty text are dealt with by concatenating the entire context.
    #
    # @return [Array(String, String)] text before and text after the node
    def l10n_context(nodes, idx)
      prev = nodes[0...idx].map(&:text).join
      foll = nodes[(idx + 1)...(nodes.size)].map(&:text).join
      [prev, foll]
    end

    # French localisation: parse +text+ as an XML fragment and localise each
    # text node, using all other text nodes as context.
    def l10n_fr(text, locale)
      xml = Nokogiri::XML::DocumentFragment.parse(text)
      nodes = xml.xpath(".//text()")
      nodes.each_with_index do |node, i|
        prev, foll = l10n_context(nodes, i)
        str = cleanup_entities(node.text, is_xml: false)
        node.replace(l10n_fr1(str, prev, foll, locale))
      end
      xml.to_xml(encoding: "UTF-8")
    end

    # Regex source for one character treated as CJK.
    ZH_CHAR = "(\\p{Han}|\\p{In CJK Symbols And Punctuation}|" \
              "\\p{In Halfwidth And Fullwidth Forms})".freeze

    # Localise a single CJK text string: punctuation, then spaces, then
    # dashes. The script argument is currently unused.
    # note: we can't differentiate comma from enumeration comma 、
    def l10_zh1(text, prev, foll, _script)
      r = l10n_zh_punct(text, prev, foll)
      r = l10n_zh_remove_space(r, prev, foll)
      l10n_zh_dash(r, prev, foll)
    end

    ZH1_PUNCT = /(#{ZH_CHAR}|^) # CJK character, or start of string
                 (\s*)$ # Latin spaces optional
                /xo.freeze
    ZH2_PUNCT = /^\s* # followed by ignorable Latin spaces
                 [:,.()\[\];?!-]* # Latin punct which will also convert to CJK
                 (#{ZH_CHAR}|$) # CJK character, or end of string
                /xo.freeze

    # Latin punctuation and its full-width equivalent, in processing order.
    # Targets are spelled as escapes so that width-folding of the source
    # text cannot silently turn each pair into a no-op.
    ZH_PUNCT_PAIRS = [
      [":", "\uFF1A"], [",", "\uFF0C"], [".", "\uFF0E"], [")", "\uFF09"],
      ["]", "\uFF3D"], [";", "\uFF1B"], ["?", "\uFF1F"], ["!", "\uFF01"],
      ["(", "\uFF08"], ["[", "\uFF3B"]
    ].freeze

    # CJK punct if (^|CJK).($|CJK)
    def l10n_zh_punct(text, prev, foll)
      ZH_PUNCT_PAIRS.reduce(text) do |acc, pair|
        l10n_gsub(acc, prev, foll, pair, [ZH1_PUNCT, ZH2_PUNCT])
      end
    end

    ZH1_DASH = /(#{ZH_CHAR}|^) # CJK character, or start of string
                (\d*) # optional digits
                $/xo.freeze
    ZH2_DASH = /^\d* # followed by optional digits
                (#{ZH_CHAR}|$) # CJK character, or end of string
               /xo.freeze

    # Replace an en dash by a full-width tilde (U+FF5E) when it sits
    # between CJK characters (optionally separated from them by digits).
    def l10n_zh_dash(text, prev, foll)
      l10n_gsub(text, prev, foll, ["–", "\uFF5E"], [ZH1_DASH, ZH2_DASH])
    end

    # Context-sensitive substitution.
    #
    # @param text [String] string to rewrite
    # @param prev [String] text preceding +text+ in the document
    # @param foll [String] text following +text+ in the document
    # @param delim [Array] [what to find (String or Regexp), replacement];
    #   "\\0" in the replacement stands for the matched text
    # @param regex [Array<Regexp>] [pattern the preceding context must
    #   match, pattern the following context must match]
    # @return [String] +text+ with every qualifying occurrence replaced
    def l10n_gsub(text, prev, foll, delim, regex)
      context = l10n_gsub_context(text, prev, foll, delim)
      return text unless context

      inner = 1...(context.size - 1) # skip prev (first) and foll (last)
      inner.each do |i|
        next unless l10_context_valid?(context, i, delim, regex)

        # Block form: the matched text is inserted literally, never
        # interpreted as a replacement pattern.
        context[i] = delim[1].gsub("\\0") { context[i] }
      end
      context[inner].join
    end

    # Split +text+ on the delimiter (keeping the delimiters) and surround
    # the pieces with +prev+ and +foll+.
    #
    # @return [Array<String>, nil] nil when the split yields a single piece
    def l10n_gsub_context(text, prev, foll, delim)
      d = delim[0].is_a?(Regexp) ? delim[0] : Regexp.quote(delim[0])
      pieces = text.split(/(#{d})/) # delim to replace
      return if pieces.size == 1

      [prev, pieces, foll].flatten
    end

    # Is context[idx] the delimiter, with preceding and following context
    # matching regex[0] and regex[1] respectively?
    def l10_context_valid?(context, idx, delim, regex)
      found_delim = if delim[0].is_a?(Regexp) # punct to convert
                      delim[0].match?(context[idx])
                    else
                      context[idx] == delim[0]
                    end
      found_delim &&
        regex[0].match?(context[0...idx].join) && # preceding context
        regex[1].match?(context[(idx + 1)..-1].join) # foll context
    end

    # Remove a space that is (a) between a CJK character or digit and a
    # CJK character, or (b) between a CJK character and either a digit or
    # a single Latin letter that is itself followed by CJK or nothing.
    def l10n_zh_remove_space(text, prev, foll)
      text = l10n_gsub(text, prev, foll, [" ", ""],
                       [/(#{ZH_CHAR}|\d)$/o, /^#{ZH_CHAR}/o])
      l10n_gsub(text, prev, foll, [" ", ""],
                [/#{ZH_CHAR}$/o, /^(\d|[A-Za-z](#{ZH_CHAR}|$))/o])
    end

    # Localise a single French text string.
    # Inserts U+202F before » › ; ? ! when preceded by an alphanumeric and
    # followed by whitespace or nothing; inserts U+202F after « ‹ when
    # followed by any character; inserts a space before ":" under the same
    # condition as the first rule (U+202F for locale "CH", else U+00A0).
    def l10n_fr1(text, prev, foll, locale)
      text = l10n_gsub(text, prev, foll, [/[»›;?!]/, "\u202f\\0"],
                       [/\p{Alnum}$/, /^(\s|$)/])
      text = l10n_gsub(text, prev, foll, [/[«‹]/, "\\0\u202f"], [/$/, /^./])
      colonsp = locale == "CH" ? "\u202f" : "\u00a0"
      l10n_gsub(text, prev, foll, [":", "#{colonsp}\\0"],
                [/\p{Alnum}$/, /^(\s|$)/])
    end

    # Class-level counterpart of #cjk_extend.
    #
    # NOTE(review): the bare call below resolves to this same class method,
    # so as written it recurses without bound. Left unchanged because the
    # constructor needed to delegate to an instance is not visible here.
    def self.cjk_extend(text)
      cjk_extend(text)
    end

    # Insert an ideographic space (U+3000) between adjacent characters of
    # +title+ wherever #interleave_space_cjk? allows it.
    # +title+ is first decoded with @c; neighbouring pairs are taken from
    # the decoded characters, so entities in +title+ do not skew the pairing.
    #
    # @param title [String]
    # @return [String]
    def cjk_extend(title)
      chars = @c.decode(title).chars
      chars.each_with_index.map do |char, i|
        if i.zero? || !interleave_space_cjk?(chars[i - 1] + char)
          char
        else "\u3000#{char}"
        end
      end.join
    end

    # May an ideographic space be inserted between the two characters of
    # +text+?
    #
    # @param text [String] a two-character string
    # @return [Boolean, nil] nil unless +text+ has exactly two characters;
    #   false for doubled dashes/ellipses, two digits, two Latin letters,
    #   any whitespace, a leading opening bracket or quote, a trailing
    #   closing bracket or quote, or any listed punctuation; true otherwise.
    #   ASCII brackets and punctuation are listed together with their
    #   full-width forms.
    def interleave_space_cjk?(text)
      return unless text.size == 2
      return false if ["\u2014\u2014", "\u2025\u2025", "\u2026\u2026",
                       "\u22ef\u22ef"].include?(text)
      return false if /\d\d|\p{Latin}\p{Latin}|[[:space:]]/.match?(text)
      # opening brackets and quotes in first position
      return false if /^[\u2018\u201c(\uff08\u3014\[\uff3b{\uff5b\u3008\u300a\u300c\u300e\u3010\u2985\u3018\u3016\u00ab\u301d]/.match?(text)
      # closing brackets and quotes in last position
      return false if /[\u2019\u201d)\uff09\u3015\]\uff3d}\uff5d\u3009\u300b\u300d\u300f\u3011\u2986\u3019\u3017\u00bb\u301f]$/.match?(text)
      # full stops, commas, colons, hyphens, exclamation and question marks
      return false if /[\u3002.\uff0e\u3001,\uff0c\u30fb:\uff1a;\uff1b\u2010\u301c\u30a0\u2013!\uff01?\uff1f\u203c\u2047\u2048\u2049]/.match?(text)

      true
    end
  end
end