မော်ဂျူး:ja-translit
Documentation for this module may be created at မော်ဂျူး:ja-translit/doc
-- Table of public functions; it is returned at the end of the module.
local export = {}

-- Shared transliteration data, loaded through mw.loadData.
local data_common = mw.loadData('Module:ja-translit/data')

-- Romanisation stored for 'っ' in the shared `rom` table. kana_to_romaji also
-- uses it as its internal apostrophe marker and replaces every occurrence
-- with "'" in the final output.
local c_apos = data_common.rom['っ']
--- Build a lookup function over the transliteration data tables.
-- The returned function takes a key path (`d('rom', 'か')`, `d('tr_long')`, ...)
-- and returns the value found by indexing along that path, or nil.
-- When `lang_name` is given and `package.loaders[2]` reports a module named
-- 'Module:ja-translit/data/<lang_name>', values from that module take
-- precedence and the shared data is the fallback; otherwise only the shared
-- data is consulted.
-- @param lang_name optional language name used to build the data module title
-- @return function(...) performing the lookup
local function get_data(lang_name)
	-- Index `node` successively by each key in `...`; nil as soon as a
	-- non-table is reached while keys remain.
	local function walk(node, ...)
		local depth = select('#', ...)
		for level = 1, depth do
			if type(node) ~= 'table' then
				return nil
			end
			node = node[(select(level, ...))]
		end
		return node
	end

	local data_lang
	if lang_name then
		local name_data = 'Module:ja-translit/data/' .. lang_name
		if package.loaders[2](name_data) then
			data_lang = mw.loadData(name_data)
		end
	end

	if not data_lang then
		-- No language-specific data: look up in the shared table only.
		return function(...)
			local first = ...
			return walk(data_common[first], select(2, ...))
		end
	end

	return function(...)
		local first = ...
		local from_lang, from_common = data_lang[first], data_common[first]
		local depth = select('#', ...)
		for level = 2, depth do
			-- Language branch ran out: finish the path in the shared data.
			if type(from_lang) ~= 'table' then
				return walk(from_common, select(level, ...))
			end
			local key = (select(level, ...))
			from_lang = from_lang[key]
			-- Shared branch ran out: finish the path in the language data,
			-- which has already consumed the key at this level.
			if type(from_common) ~= 'table' then
				return walk(from_lang, select(level + 1, ...))
			end
			from_common = from_common[key]
		end
		if from_lang == nil then
			return from_common
		end
		return from_lang
	end
end
--- Convert kana text to romaji.
-- @param text string to transliterate.
-- @param options optional table. Fields read in this function:
--   language_name  passed to get_data to select language-specific data;
--   hist           look in the 'rom_hist' table first, skip the は/へ handling,
--                  and change some of the vowel/glide rules below;
--   keep_period    keep an input '.' as '.';
--   phonetic       always take the flag_hahe replacement, and never merge
--                  vowel pairs through 'tr_long' during the main scan;
--   no_diacritics  skip the final 'tr_long' substitution.
-- @return string: all output pieces concatenated, with every c_apos replaced by "'".
function export.kana_to_romaji(text, options)
options = options or {}
-- Output pieces, one per processed input character. Slot 0 is an empty
-- sentinel so that result[getlast()] is a string even when getlast() returns 0.
local result = {[0] = ''}
-- Side value for each piece, taken from the 'rom_sp' data (or set to 'stop').
-- Values compared in this function: 'stop', 'voiced', 'semivoiced', 'gem',
-- 'cap', "'"; when 'flag_hahe' is set for the value it is also used as a
-- replacement romanisation.
local result_sp = {}
-- d(...) looks a key path up in the transliteration data.
local d = get_data(options.language_name)
-- Scan `result` backwards from i_start (default: last piece) down to 1 and
-- return the first index accepted by predicate_good. Pieces from a '>' back to
-- the matching '<' are skipped. The scan aborts when predicate_bad accepts an
-- index. Returns 0 when nothing is found.
-- Default predicate_good: non-empty piece whose side value is not "'".
-- Default predicate_bad: side value is 'stop'.
local function getlast(i_start, predicate_good, predicate_bad)
local in_xml = false
for i = i_start or #result, 1, -1 do
if in_xml then
if result[i] == '<' then in_xml = false end
elseif result[i] == '>' then
in_xml = true
else
if (predicate_bad or function(index)
return result_sp[index] == 'stop'
end)(i) then break end
if (predicate_good or function(index)
return result[index]:len() > 0 and result_sp[index] ~= '\''
end)(i) then return i end
end
end
return 0
end
-- Pre-process the text, then iterate over it one UTF-8 character at a time:
--  1. each run of ゝヽゞヾ is replaced by the same number of characters taken
--     from just before the run, with a combining dakuten inserted for ゞ/ヾ;
--  2. characters in ァ-ヶ are shifted down by 96 code points;
--  3. ゐ/ゑ/を followed by a combining dakuten become ヸ/ヹ/ヺ.
for c in mw.ustring.gsub(mw.ustring.gsub(text, '()([ゝヽゞヾ]+)', function(p1, m2) -- repetition mark
local len = mw.ustring.len(m2)
local sec_rep = mw.ustring.sub(text, p1 - len, p1 - 1)
for i = len, 1, -1 do
if ({['ゞ'] = true, ['ヾ'] = true})[mw.ustring.sub(m2, i, i)] then
sec_rep = mw.ustring.sub(sec_rep, 1, i) .. '゙' .. mw.ustring.sub(sec_rep, i + 1)
end
end
return sec_rep
end), '[ァ-ヶ]', function(m1) -- kata to hira
return mw.ustring.char(mw.ustring.codepoint(m1) - 96)
end):gsub("\227\130[\144-\146]゙", {
-- convert わ゙, ゐ゙, ゑ゙, を゙ to ヷ, ヸ, ヹ, ヺ, to ensure voicing works correctly
-- NOTE(review): the byte range \144-\146 and the table below only cover
-- ゐ, ゑ and を; わ゙ is not converted here. Confirm whether that is intended.
['ゐ゙'] = 'ヸ', ['ゑ゙'] = 'ヹ', ['を゙'] = 'ヺ',
}):gmatch'.[\128-\191]*' do
-- rc: romanisation of this character (falls back to the character itself).
-- rc_sp: its side value from 'rom_sp'.
local rc = options.hist and d('rom_hist', c) or d('rom', c) or c
local rc_sp = d('rom_sp', c)
local i_last = getlast()
-- A character matching %a is marked 'stop'.
if options.keep_period and c == '.' then rc = '.'
elseif c:match'%a' then rc_sp = 'stop' end
-- If the data defines a digraph for this character after the previous piece,
-- the previous piece is replaced and this character contributes nothing.
local repl_digraph = d('digraph', c, result[i_last])
if repl_digraph then
result[i_last], rc = repl_digraph, ''
result_sp[i_last], rc_sp = nil, nil
end
if not options.hist then --はへ
-- Previous piece carries a flag_hahe side value and is followed by one of
-- - . ー, a combining (han)dakuten, a letter or c_apos: replace the previous
-- piece by its side value.
if d('flag_hahe', result_sp[i_last]) and (mw.ustring.match(c, '[-%.ー゙゚]') or rc:match'%a' or rc == c_apos) then
result[i_last] = result_sp[i_last]
result_sp[i_last] = nil
end
-- Current character carries a flag_hahe side value: use that value as its
-- romanisation when options.phonetic is set, when the nearest preceding
-- non-empty piece is a 'stop', or when the previous piece contains a
-- letter or '-' or equals c_apos.
if d('flag_hahe', rc_sp) and (options.phonetic or result_sp[getlast(nil, function(i)
return result[i]:len() > 0 and result_sp[i] ~= '\'' or result_sp[i] == 'stop'
end, function(i) return false end)] == 'stop' or result[i_last]:match'[-%a]' or result[i_last] == c_apos) then
rc = rc_sp
rc_sp = nil
end
end
-- Add a space after closing punctuation that is followed by a letter, and
-- before an opening bracket/quote that follows a letter.
-- NOTE(review): 'Ӡ' in the first character class looks out of place next to
-- the opening quote '“' handled below; possibly a corrupted '”'. Confirm.
if rc:match'%a' and mw.ustring.match(result[i_last], '^[,%.?!:)Ӡ]$') then --space and punctuations
result[i_last] = result[i_last] .. ' '
elseif mw.ustring.match(rc, '^[(“]$') and result[i_last]:match'%a' then
rc = ' ' .. rc
end
-- (Semi)voicing marks rewrite the leading consonant cluster of the previous
-- piece through the 'tr_voicing' / 'tr_semivoicing' tables.
if rc_sp == 'voiced' then --voicing
result[i_last] = result[i_last]:gsub('^[b-df-hj-np-tv-z]+', d('tr_voicing'))
elseif rc_sp == 'semivoiced' then
result[i_last] = result[i_last]:gsub('^[b-df-hj-np-tv-z]+', d('tr_semivoicing'))
end
-- A bare 'n' followed by a vowel or y (also w with options.hist) gets the
-- apostrophe marker in between.
if result[i_last] == 'n' and rc:match(options.hist and '^[aiueoyw]' or '^[aiueoy]') then --na vs n'a
rc = c_apos .. rc
end
-- r_lastlast: the last letter of the previous piece plus any non-letters after it.
local r_lastlast = result[i_last]:match'^.*(%a%A*)$' --vowel clusters or stop consonants
if r_lastlast then
if c == 'ー' then
-- ー repeats the tail of the previous piece.
result[i_last] = result[i_last] .. r_lastlast
elseif r_lastlast:match("[aiueo]") then
if rc:match'^%-[yw]' or options.hist and (r_lastlast == 'i' and rc:sub(1, 1) == 'y' or r_lastlast == 'u' and rc:sub(1, 1) == 'w') then
-- A '-y…'/'-w…' piece (or, with hist, i+y / u+w): drop the final
-- character of the previous piece and attach the glide; the 'y' is
-- dropped after a flag_postalveolarconsonant piece.
if rc:sub(1, 1) == '-' then rc = rc:sub(2) end
result[i_last] = result[i_last]:sub(1, -2)
if rc:sub(1, 1) == 'y' and d('flag_postalveolarconsonant', result[i_last]) then rc = rc:sub(2) end
elseif rc:match'^%-[aiueo]$' then
-- A '-<vowel>' piece: combine it with the vowel ending the previous piece.
rc = rc:sub(2)
if r_lastlast == rc then
result[i_last] = result[i_last] .. r_lastlast
rc = ''
elseif d('flag_specialconsonant', result[i_last]) then
result[i_last] = result[i_last]:sub(1, -2)
elseif r_lastlast == 'i' then
result[i_last] = result[i_last]:sub(1, -2) .. 'y'
elseif r_lastlast:match'[ou]' and rc ~= 'u' then
result[i_last] = result[i_last]:sub(1, -2) .. 'w'
else
result[i_last] = result[i_last]:sub(1, -2)
end
elseif rc:match'^[aiueo]$' then
-- Plain vowel after a vowel: if 'tr_long' has an entry for the pair and
-- the previous piece does not already end in two vowels, move the vowel
-- into the previous piece.
if not options.hist and not options.phonetic and d('tr_long', r_lastlast .. rc) and not result[i_last]:match'[aiueo][aiueo]$' then
result[i_last] = result[i_last] .. rc
rc = ''
end
end
end
end
table.insert(result, rc)
result_sp[#result] = rc_sp
end
if not options.hist then --isolated はへ
-- The last piece carries a flag_hahe side value and getlast finds nothing
-- before it: use the side value as its romanisation.
local i_last = getlast()
if d('flag_hahe', result_sp[i_last]) and getlast(i_last - 1) == 0 then
result[i_last] = result_sp[i_last]
end
end
-- Second pass over the pieces: gemination, ng apostrophe, long-vowel
-- substitution and capitalisation.
-- num_cap: number of pending 'cap' markers still to be applied.
-- has_gem: a 'gem' piece has been seen and awaits a following consonant.
local num_cap = 0
local has_gem = false
for i, v in ipairs(result) do
--gemination
if has_gem then
local apos, consonant, remainder = v:match('^(' .. c_apos .. '*)([b-df-hj-np-tv-z]+)(.*)')
if consonant then
-- Replacement for each pending 'gem' piece: the 'tr_gem' entry if there
-- is one, otherwise the first letter of the consonant cluster.
local c_gem = d('tr_gem', apos .. consonant) or consonant:sub(1, 1)
v = consonant .. remainder
local i_gem = getlast(i)
-- Walk back over consecutive 'gem' pieces, overwriting each; any leading
-- apostrophe markers stripped from v are re-attached in front of the
-- piece found by getlast(i_gem + 1).
while true do
i_gem = getlast(i_gem - 1)
if result_sp[i_gem] == 'gem' then
result[i_gem] = c_gem
else
i_gem = getlast(i_gem + 1)
result[i_gem] = apos .. result[i_gem]
break
end
end
has_gem = false
end
elseif result_sp[i] == 'gem' then
has_gem = true
end
-- anga vs a'nga
-- A piece starting with 'ng': append the apostrophe marker to the nearest
-- preceding non-'gem' piece if that piece contains a letter.
if v:match'^ng' then
local i_no_gem = getlast(i - 1, function(index)
return result[index]:len() > 0 and result_sp[index] ~= '\'' and result_sp[index] ~= 'gem'
end)
if mw.ustring.match(result[i_no_gem], '%a') then
result[i_no_gem] = result[i_no_gem] .. c_apos
end
end
--diacritics (long vowels and others)
if not options.no_diacritics then
v = v:gsub('[aiueo][aiueo%A]*', d('tr_long'))
end
--uppercase
-- Each 'cap' marker upper-cases the next character whose upper-case form
-- differs from it; the count carries over to later pieces.
if result_sp[i] == 'cap' then num_cap = num_cap + 1 end
if num_cap > 0 then
v = v:gsub('.[\128-\191]*', function(c)
if num_cap <= 0 then return c end
local uc = mw.ustring.upper(c)
if c ~= uc then num_cap = num_cap - 1 end
return uc
end)
end
result[i] = v
end
-- table.concat starts at index 1, so the sentinel in slot 0 is not emitted.
return (table.concat(result):gsub(c_apos, "'"))
end
--- Tell whether `str` consists only of romaji letters once non-letters are ignored.
-- Every character matching %A is removed first; the remainder must contain
-- nothing outside the basic Latin letters and the macron vowels ĀĪŪĒŌāīūēō.
-- @param str string to check
-- @return boolean true when no other letter is present (also for an empty remainder)
local function is_good_romaji(str)
	local letters_only = mw.ustring.gsub(str, '%A', '')
	-- The range is spelled A-Za-z; the former 'A-za-z' also spanned the six
	-- punctuation characters between 'Z' and 'a'.
	return mw.ustring.match(letters_only, '[^A-Za-zĀĪŪĒŌāīūēō]') == nil
end
--- Adjust a romanisation according to a part-of-speech label.
--   'proper'                          capitalise the first letter of every word,
--                                     unless `rom` already contains an upper-case letter;
--   'prefix'                          make sure `rom` ends with exactly one added '-';
--   'suffix', 'counter', 'classifier' make sure `rom` starts with '-';
--   anything else                     return `rom` unchanged.
-- @param rom string romanisation
-- @param pos string part-of-speech label
-- @return string adjusted romanisation
local function format_pos_romaji(rom, pos)
	if pos == 'proper' then
		-- Use find, not gmatch: gmatch returns an iterator function, which is
		-- always truthy and would make this branch return `rom` unconditionally.
		if mw.ustring.find(rom, '%u') then
			return rom
		end
		return (mw.ustring.gsub(rom, '%f[%a]%a', mw.ustring.upper))
	elseif pos == 'prefix' then
		-- Explicit check instead of gsub('%-?$', '-'): on Lua 5.1 that pattern
		-- also replaces the empty match at end of string, yielding '--' when
		-- `rom` already ends with '-'.
		if rom:sub(-1) == '-' then
			return rom
		end
		return rom .. '-'
	elseif pos == 'suffix' or pos == 'counter' or pos == 'classifier' then
		if rom:sub(1, 1) == '-' then
			return rom
		end
		return '-' .. rom
	end
	return rom
end
--- Transliterate `text`, using the wiki page titled `text` to refine the result.
-- The title itself is romanised first (`rom_title`; nil when the result is not
-- good romaji). If the page has content, its {{ja-noun}}, {{ja-verb}},
-- {{ja-verb-suru}}, {{ja-adj}}, {{ja-phrase}}, {{ja-combining form}},
-- {{ja-verb form}}, {{ja-see}} and {{ja-pos}} templates are scanned:
--   * a template without arguments, or without any argument that romanises
--     cleanly, contributes `rom_title`;
--   * otherwise every argument that does not contain '[[' or ']]' and does not
--     match '%D.*=' (a non-digit character followed somewhere by '=') is
--     romanised and contributes when the result is good romaji;
--   * for {{ja-pos}}, contributions are passed through format_pos_romaji with
--     the template's first argument.
-- As soon as two contributions differ, `rom_title` is returned.
-- @param text string to transliterate (also used as the page title)
-- @param lang language name forwarded as options.language_name
-- @param sc unused
-- @return string or nil
function export.tr(text, lang, sc)
	local options = { language_name = lang }
	local rom_title = export.kana_to_romaji(text, options)
	if not is_good_romaji(rom_title) then rom_title = nil end

	-- mw.title.new returns nil when `text` is not a valid page title; in that
	-- case there is no page to consult.
	local title = mw.title.new(text)
	local pagetext = title and title:getContent()
	if not pagetext then
		return rom_title
	end

	local rom_result
	-- Record `rom` as the romanisation agreed on so far.
	-- Returns true when it contradicts an earlier one.
	local function conflicts(rom)
		if rom_result and rom_result ~= rom then
			return true
		end
		rom_result = rom
		return false
	end

	for _, tn in ipairs{'noun', 'verb', 'verb%-suru', 'adj', 'phrase', 'combining form', 'verb form', 'see'} do
		-- Template without arguments: the title is the reading.
		if rom_title and pagetext:match('{{ja%-' .. tn .. '}}') then
			if conflicts(rom_title) then return rom_title end
		end
		for t in pagetext:gmatch('{{ja%-' .. tn .. '(|..-})}') do
			local no_kana = true
			for tt in t:gmatch'%f[^|]..-%f[|}]' do
				if not tt:match'%D.*=' and not tt:match'%[%[' and not tt:match']]' then
					local rom = export.kana_to_romaji(tt, options)
					if is_good_romaji(rom) then
						no_kana = false
						if conflicts(rom) then return rom_title end
					end
				end
			end
			if rom_title and no_kana then
				if conflicts(rom_title) then return rom_title end
			end
		end
	end

	for t in pagetext:gmatch'{{ja%-pos|(..-})}' do
		-- pos: first argument; ta: the remaining arguments including the closing brace.
		local pos, ta = t:match'^(..-)(|..-})$'
		if ta then
			local no_kana = true
			for tt in ta:gmatch'%f[^|]..-%f[|}]' do
				if not tt:match'%D.*=' and not tt:match'%[%[' and not tt:match']]' then
					local rom = export.kana_to_romaji(tt, options)
					if is_good_romaji(rom) then
						no_kana = false
						if conflicts(format_pos_romaji(rom, pos)) then return rom_title end
					end
				end
			end
			if rom_title and no_kana then
				if conflicts(format_pos_romaji(rom_title, pos)) then return rom_title end
			end
		elseif rom_title then
			-- Only the part of speech was given; strip the trailing '}' from it.
			if conflicts(format_pos_romaji(rom_title, t:sub(1, -2))) then return rom_title end
		end
	end

	return rom_result or rom_title
end
-- A hack to bypass [[mod:languages]] bug [[special:diff/72585061]]:
-- export.tr is replaced by a wrapper that calls the original and substitutes
-- every ASCII apostrophe in its result with the output of a <nowiki> extension
-- tag wrapping an apostrophe. A nil/false result yields no return value.
local tr_unwrapped = export.tr
function export.tr(...)
	local romaji = tr_unwrapped(...)
	if not romaji then
		return
	end
	local nowiki_apostrophe = mw.getCurrentFrame():extensionTag('nowiki', '\'')
	return (romaji:gsub("'", nowiki_apostrophe))
end
-- Hand the table of public functions back to whatever loads this module.
return export