-- မော်ဂျူး:Jpan-sortkey
-- Documentation for this module may be created at မော်ဂျူး:Jpan-sortkey/doc
-- Table of functions returned by this module.
local export = {}

-- Range strings from the shared Japanese data module. They are spliced into
-- "[...]" character classes by the functions below.
local range = mw.loadData("Module:ja/data/range")
local kanji_pattern = range.kanji
local ideograph_pattern = range.ideograph
local kana_graph_pattern = range.kana_graph
local latin_pattern = range.latin

-- Short aliases for the Unicode-aware string functions.
local ustring = mw.ustring
local find, gsub = ustring.find, ustring.gsub
local toNFC, toNFD = ustring.toNFC, ustring.toNFD
local u = ustring.char

-- Byte-level Lua pattern for one UTF-8 encoded character: an ASCII or lead
-- byte followed by any number of continuation bytes.
local UTF8_char = "[%z\1-\127\194-\244][\128-\191]*"
-- Turns a kana string into a sort key.
--
-- The text is converted to hiragana and decomposed (NFD), a voicing mark on
-- the first character is moved to the end as apostrophes, long vowel marks are
-- replaced by the vowel they prolong, and "・" / "゠" become spaces. The result
-- is then passed through Module:Hani-sortkey.
--
-- text: the string to convert.
-- lang: language code; forwarded to Module:Hani-sortkey and used in the
--       tracking key. No tracking is done for "mul".
-- sc:   forwarded unchanged to Module:Hani-sortkey.
-- Returns whatever Module:Hani-sortkey's makeSortKey returns.
function export.sortkey_from_string(text, lang, sc)
	local hiragana = require("Module:ja").kata_to_hira(text)
	text = toNFD(hiragana)

	-- A leading character carrying a dakuten loses it and the whole key gains a
	-- trailing apostrophe, e.g. がす > かす'. A handakuten is treated the same
	-- way but yields two apostrophes, e.g. ぱす > はす''.
	local first_char = "^(" .. UTF8_char .. ")"
	text = text:gsub(first_char .. u(0x3099) .. "(.*)", "%1%2'")
	text = text:gsub(first_char .. u(0x309A) .. "(.*)", "%1%2''")

	-- Expand each long vowel mark into the vowel of the preceding kana.
	if text:find("ー", 1, true) then
		-- Optional combining dakuten/handakuten after the kana (text is NFD).
		local voicing = "[" .. u(0x3099) .. u(0x309A) .. "]?"
		-- {kana that end in the vowel, replacement for the following ー}
		local rows = {
			{"あぁかさたなはまやゃらわ", "あ"},
			{"いぃきしちにひみり", "い"},
			{"うぅくすつぬふむゆゅる", "う"},
			{"えぇけせてねへめれ", "え"},
			{"おぉこそとのほもよょろ", "お"},
			{"ん", "ん"},
		}
		for _, row in ipairs(rows) do
			text = gsub(text, "([" .. row[1] .. "]" .. voicing .. ")ー", "%1" .. row[2])
		end
	end

	text = gsub(text, "[・゠]", " ")

	local ret = require("Module:Hani-sortkey").makeSortKey(text, lang, sc)
	-- Record pages where Module:Hani-sortkey changed the text.
	if lang ~= "mul" and ret ~= text then
		require("Module:debug/track"){"Jpan-sortkey/fallback", "Jpan-sortkey/fallback/" .. lang}
	end
	return ret
end
-- Builds a reading for `pagename` from the arguments of a kanjitab template.
--
-- Each positional argument of `kanjitab` is a reading optionally followed by
-- the number of kanji it spans (default 1). "k<i>" overrides the i-th reading
-- and "o<i>" is appended to it. A reading equal to "-" contributes nothing.
-- The non-kanji runs of the page name are interleaved with the readings.
--
-- Returns the assembled string, or `pagename` unchanged if the result is empty.
local function sortkey_from_kanjitab(pagename, kanjitab)
	-- non_kanji[1] is the text before the first kanji; non_kanji[n + 1] is the
	-- text following the n-th kanji (possibly empty).
	local non_kanji = {}
	local kanji_border = 1
	mw.ustring.gsub(pagename, "()([" .. kanji_pattern .. "々])()", function(kanji_start, _, kanji_end)
		table.insert(non_kanji, mw.ustring.sub(pagename, kanji_border, kanji_start - 1))
		kanji_border = kanji_end
	end)
	table.insert(non_kanji, mw.ustring.sub(pagename, kanji_border))

	local pieces = {non_kanji[1]}
	-- Index into non_kanji of the run that follows the kanji consumed so far.
	local id = 1
	for i, spec in ipairs(kanjitab) do
		local _, _, reading_kana, reading_length = mw.ustring.find(spec, "^([^0-9]*)([0-9]*)$")
		local reading = ""
		if reading_kana and reading_kana ~= "" then
			reading_length = tonumber(reading_length) or 1
			reading = (kanjitab["k" .. i] or reading_kana) .. (kanjitab["o" .. i] or "")
			if reading == "-" then
				reading = ""
			end
		else
			-- No reading given: skip one kanji and contribute nothing.
			reading_length = 1
		end
		id = id + reading_length
		pieces[#pieces + 1] = reading .. (non_kanji[id] or "")
	end

	local sortkey = table.concat(pieces)
	if sortkey == "" then
		return pagename
	end
	return sortkey
end

-- Produces a sort key for `text` in language `lang`.
--
-- While the text still contains digits or characters from the kanji,
-- ideograph, kana-graph or Latin ranges, the page of that name is read and a
-- kana reading is taken from the templates in its language section. Each page
-- name is looked up at most once. The final text goes through
-- export.sortkey_from_string.
--
-- text: page name / term to sort.
-- lang: language code; "mul" skips page scraping entirely.
-- sc:   forwarded unchanged to export.sortkey_from_string.
-- Returns the value of export.sortkey_from_string(text, lang, sc).
function export.makeSortKey(text, lang, sc)
	-- Nothing is scraped for "mul", so avoid loading the language data.
	if lang == "mul" then
		return export.sortkey_from_string(text, lang, sc)
	end

	local langname = require("Module:languages").getByCode(lang):getCanonicalName()
	-- Matches the level-2 header of this language's section; hyphens in the
	-- language name are escaped because they are magic in Lua patterns.
	local header_pattern = "%f[^%z%s]==%s*" .. langname:gsub("%-", "%%%-") .. "%s*==()"

	-- Templates whose 2nd positional argument holds the reading.
	local reading_in_arg2 = {
		[lang .. "-head"] = true,
		[lang .. "-pos"] = true,
	}
	-- Templates whose 1st positional argument holds the reading.
	local reading_in_arg1 = {
		[lang .. "-noun"] = true,
		[lang .. "-verb"] = true,
		[lang .. "-adj"] = true,
		[lang .. "-phrase"] = true,
		[lang .. "-verb form"] = true,
		[lang .. "-verb-suru"] = true,
		[lang .. "-see"] = true,
		[lang .. "-see-kango"] = true,
		[lang .. "-gv"] = true,
	}
	local kanjitab_name = lang .. "-kanjitab"

	local seen_pages = {}

	-- Reads page `pagename` and returns a reading found in its language
	-- section, or `pagename` itself when nothing usable is found.
	local function scrape_page(pagename)
		seen_pages[pagename] = true

		-- mw.title.new returns nil for an invalid title; treat that like a
		-- missing page instead of raising an error.
		local title = mw.title.new(toNFC(pagename))
		local content = title and title:getContent()
		if not content then
			return pagename
		end

		local section_start, header_end = content:find(header_pattern)
		if not section_start then
			return pagename
		end
		-- The section runs up to the next level-2 header, or to the end of the
		-- page when there is none (section_end is then nil).
		local section_end = content:find("%f[^%z%s]==[^\n=]+==", header_end)
		local section = content:sub(section_start, section_end)

		local findTemplates = require("Module:templateparser").findTemplates
		local reading, kanjitab
		for template, args in findTemplates(section) do
			if reading_in_arg2[template] then
				if args[2] then
					reading = args[2]:gsub("[ %-%.^%%]", "")
					break
				end
			elseif template == "head" or template == "head-lite" then
				if args[1] == lang then
					-- The reading is the argument following a "kana" label.
					for i, arg in ipairs(args) do
						if arg == "kana" and args[i + 1] then
							reading = args[i + 1]
							break
						end
					end
					-- The break above only leaves the argument loop; stop
					-- scanning templates too so a later template cannot
					-- overwrite the reading just found.
					if reading then
						break
					end
				end
			elseif reading_in_arg1[template] then
				if args[1] then
					reading = args[1]:gsub("[ %-%.^%%]", "")
					break
				end
			elseif template == kanjitab_name then
				-- Only the first kanjitab of the section is used.
				kanjitab = kanjitab or args
			end
		end

		if reading then
			return reading
		end
		if not kanjitab then
			return pagename
		end

		-- Fall back to the kanjitab readings when no headword template gave one.
		require("Module:debug/track"){"Jpan-sortkey/kanjitab", "Jpan-sortkey/kanjitab/" .. lang}
		if kanjitab.sortkey then
			return kanjitab.sortkey
		end
		return sortkey_from_kanjitab(pagename, kanjitab)
	end

	local needs_scraping = "[0-9" .. kanji_pattern .. ideograph_pattern .. kana_graph_pattern .. latin_pattern .. "]"
	while (not seen_pages[text]) and find(text, needs_scraping) do
		text = scrape_page(text)
	end

	return export.sortkey_from_string(text, lang, sc)
end
-- Module result: the table holding sortkey_from_string and makeSortKey.
return export