Modulus:IPA
This module is used by the templates {{IPA}}
and {{IPAchar}}
to format IPA in entries, and it also converts X-SAMPA (an ASCII version of IPA) to IPA for the templates {{x2i}}
, {{x2ipa}}
, and {{x2ipachar}}
. The actual functions called by these templates are found in Module:IPA/templates.
The function format_IPA_full
generates the content of the template {{IPA}}
. It should also be used by pronunciation modules for specific languages. (It is used, for instance, by Module:ru-pron.) It generates a label IPA (key), followed by a list of IPA transcriptions with the class attribute "IPA"
added to them, and a language-specific category (such as Category:English terms with IPA pronunciation).
The function format_IPA_multiple
generates the content of the template {{IPAchar}}
. It is similar to format_IPA_full
, but does not add a label or categories.
Data is in Module:IPA/data, Module:IPA/data/symbols, and Module:IPA/data/X-SAMPA.
Unit tests
- See also: Module:IPA/testcases
IPA to X-SAMPA back to IPA
| Term | IPA | Generated X-SAMPA | Regenerated IPA | Matched? |
|---|---|---|---|---|
| dictionary | /ˈdɪkʃən(ə)ɹi/ | /"dIkS@n(@)r\i/ | /ˈdɪkʃən(ə)ɹi/ | yes |
| | /ˈdɪkʃənɛɹi/ | /"dIkS@nEr\i/ | /ˈdɪkʃənɛɹi/ | yes |
| Україна (Ukrajina) | /ukrɑˈjɪnɑ/ | /ukrA"jInA/ | /ukrɑˈjɪnɑ/ | yes |
| نوروز (nuruz) | [næu̯ˈɾoːz] | [n{u_^"4o:z] | [næu̯ˈɾoːz] | yes |
| | [nou̯ˈɾuːz] | [nou_^"4u:z] | [nou̯ˈɾuːz] | yes |
| | [noːˈɾuːz] | [no:"4u:z] | [noːˈɾuːz] | yes |
| | [næu̯ˈɾɵːz] | [n{u_^"48:z] | [næu̯ˈɾɵːz] | yes |
| 新年 | [ɕɪn˥˥niɛn˧˥] | [s\In__T__TniEn__M__T] | [ɕɪn˥˥niɛn˧˥] | yes |
| battleship | [ˈbætl̩ʃɪp] | ["b{tl=SIp] | [ˈbætl̩ʃɪp] | yes |
| báid | [bˠɑːdʲ] | [b_GA:d_j] | [bˠɑːdʲ] | yes |
| Deutsch | [dɔʏ̯t͡ʃ] | [dOY_^t__S] | [dɔʏ̯t͡ʃ] | yes |
| dóigh | [d̪ˠoːɟ] | [d_d_Go:J\] | [d̪ˠoːɟ] | yes |
| murder | [ˈmɝdɚ] | ["m3`d@`] | [ˈmɝdɚ] | yes |
local export = {}
-- [[Module:IPA/data]]
local m_data = mw.loadData('Module:IPA/data') -- [[Module:IPA/data]]
local m_symbols = mw.loadData('Module:IPA/data/symbols') -- [[Module:IPA/data/symbols]]
local m_XSAMPA = mw.loadData('Module:IPA/data/X-SAMPA')
local m_syllables = require('Module:syllables') -- [[Module:syllables]]
local m_languages = require('Module:languages')
local m_links = require('Module:links')
local sub = mw.ustring.sub
local find = mw.ustring.find
local gsub = mw.ustring.gsub
local match = mw.ustring.match
local gmatch = mw.ustring.gmatch
local U = mw.ustring.char
-- Builds the complete output of {{IPA}}: the "IPA(key): " label, the formatted
-- transcriptions, and the language's pronunciation category.
--
-- Parameters:
--   lang      - language object (getCode / getCanonicalName are called on it).
--               It is dereferenced whenever `err` is not given, so it may only
--               be nil when `err` is supplied.
--   items     - list of item tables, handed on to export.format_IPA_multiple.
--   err       - optional error message; shown in place of the "key" link.
--   separator - optional separator, handed on to export.format_IPA_multiple.
--
-- Returns the label, the transcriptions and the category as one string.
function export.format_IPA_full(lang, items, err, separator)
	local infopage_langs = m_data.langs_with_infopages
	local namespace = mw.title.getCurrentTitle().nsText

	-- Text placed inside the superscript parentheses after "IPA": either the
	-- error message or a link to the pronunciation key for the language.
	local annotation
	if err then
		annotation = '<span class="error">' .. err .. '</span>'
	else
		local key_page
		if infopage_langs[lang:getCode()] then
			key_page = "Appendix:" .. lang:getCanonicalName() .. " pronunciation"
		else
			key_page = "wikipedia:" .. lang:getCanonicalName() .. " phonology"
		end
		annotation = "[[" .. key_page .. "|key]]"
	end

	local label = "[[Wiktionary:International Phonetic Alphabet|IPA]]<sup>(" .. annotation .. ")</sup>: "
	local transcriptions = export.format_IPA_multiple(lang, items, separator)

	-- Only categorise pages in the main and Reconstruction namespaces.
	local category = ""
	if lang and (namespace == "" or namespace == "Reconstruction") then
		category = "[[Category:" .. lang:getCanonicalName() .. " terms with IPA pronunciation]]"
	end

	return label .. transcriptions .. category
end
-- Classifies a transcription by the marks that enclose it.
--
-- Parameters:
--   pron - transcription string, optionally preceded by "*" (used on some
--          Reconstruction pages).
--
-- Returns two values:
--   repr          - "phonemic" (/.../), "phonetic" ([...]), "orthographic"
--                   (⟨...⟩), "rhyme" (leading "-"), or nil when the marks are
--                   missing or do not pair up.
--   reconstructed - true when a leading "*" was present, otherwise nil.
--
-- NOTE(review): this helper is declared as a global function, as in the
-- original; consider `local function` if no other module relies on the global.
function determine_repr(pron)
	local reconstructed
	-- remove initial asterisk before representation marks, used on some Reconstruction pages
	if find(pron, "^%*") then
		reconstructed = true
		pron = sub(pron, 2)
	end

	-- Keyed by the opening mark. `right` is the closing mark that must be
	-- present; entries without `right` (rhymes) need no closing mark.
	local representation_types = {
		['/'] = { right = '/', type = 'phonemic', },
		['['] = { right = ']', type = 'phonetic', },
		['⟨'] = { right = '⟩', type = 'orthographic', },
		['-'] = { type = 'rhyme' },
	}

	-- First and last character; both are nil when pron has fewer than two
	-- characters, because the pattern needs two distinct positions.
	local left, right = match(pron, '^(.).-(.)$')

	-- Kept local: the original assigned this to an undeclared (global) name.
	local representation_type = left and representation_types[left]

	local repr
	if representation_type then
		if not representation_type.right or right == representation_type.right then
			repr = representation_type.type
		end
	end

	return repr, reconstructed
end
-- Reports whether a transcription contains a syllable break (".") directly
-- followed by a primary or secondary stress mark.
-- Returns true or false.
function hasInvalidSeparators(transcription)
	return find(transcription, "%.[ˈˌ]") ~= nil
end
-- Formats a list of transcriptions (the content of {{IPAchar}}), joined by a
-- separator, followed by any cleanup/tracking categories.
--
-- Parameters:
--   lang      - language object or nil. When given, syllable-count and
--               English-specific cleanup categories may be added.
--   items     - list of tables with the fields:
--                 pron       - transcription string (required)
--                 qualifiers - optional list passed to Module:qualifier
--                 note       - optional text emitted as a <ref> after the item
--               When empty, a sample item is appended to this table on
--               Template pages; elsewhere a cleanup category is added instead.
--   separator - string placed between items; defaults to ", ".
--
-- Returns the formatted items followed by the collected categories.
function export.format_IPA_multiple(lang, items, separator)
	-- The original also assigned an unused, undeclared (global) `notes` table
	-- here; it has been removed.
	local categories = {}
	separator = separator or ', '

	-- Looked up once instead of on every use inside the loop.
	local title = mw.title.getCurrentTitle()

	-- Format
	if #items == 0 then
		if title.nsText == "Template" then
			table.insert(items, {pron = "/aɪ piː ˈeɪ/"})
		else
			table.insert(categories, "[[Category:Pronunciation templates without a pronunciation]]")
		end
	end

	local bits = {}

	for _, item in ipairs(items) do
		local bit = export.format_IPA(lang, item.pron)

		if item.qualifiers and #item.qualifiers > 0 then
			bit = require("Module:qualifier").format_qualifier(item.qualifiers) .. " " .. bit
		end

		if item.note then
			bit = bit .. mw.getCurrentFrame():extensionTag("ref", item.note)
		end

		table.insert(bits, bit)

		--[=[	[[Special:WhatLinksHere/Template:tracking/IPA/syntax-error]]
			The length or gemination symbol should not appear after a syllable break or stress symbol.	]=]
		if find(item.pron, "[ˈˌ%.][ːˑ]") then
			require("Module:debug").track("IPA/syntax-error")
		end

		if lang then
			-- Add syllable count if the language's diphthongs are listed in [[Module:syllables]].
			if title.namespace == 0
				and m_syllables.hasDiphthongs(lang)
				and determine_repr(item.pron) == "phonemic"
			then
				local syllable_count = m_syllables.getVowels(item.pron, lang)
				if syllable_count then
					table.insert(categories, "[[Category:" .. lang:getCanonicalName() .. " " .. syllable_count .. "-syllable words]]")
				end
			end

			if lang:getCode() == "en" and hasInvalidSeparators(item.pron) then
				table.insert(categories, "[[Category:IPA for English using .ˈ or .ˌ]]")
			end
		end
	end

	return table.concat(bits, separator) .. table.concat(categories)
end
-- Takes an IPA pronunciation and formats it and adds cleanup categories.
--
-- Parameters:
--   lang         - language object or nil. When given and m_data.phonemes has
--                  an entry for its code, phonemic and rhyme transcriptions
--                  are checked phoneme by phoneme and unknown characters are
--                  wrapped in a red span.
--   pron         - transcription string including its representation marks
--                  (and an optional leading "*").
--   split_output - when truthy, the three parts are returned separately
--                  (for use of IPA in links).
--
-- Returns:
--   split_output falsy:  one string: IPA span .. error span .. categories
--   split_output truthy: IPA span, categories string, error span
function export.format_IPA(lang, pron, split_output)
	local ulen = mw.ustring.len
	local err = {}
	local categories = {}

	-- Detect whether this is a phonemic or phonetic transcription
	local repr, reconstructed = determine_repr(pron)

	if reconstructed then
		pron = sub(pron, 2)
	end

	-- If valid, strip the representation marks
	if repr == "phonemic" or repr == "phonetic" or repr == "orthographic" then
		pron = sub(pron, 2, -2)
	elseif repr == "rhyme" then
		pron = sub(pron, 2)
	else
		table.insert(categories, "[[Category:IPA pronunciations with invalid representation marks]]")
		-- table.insert(err, "invalid representation marks")
		-- Removed because it's annoying when previewing pronunciation pages.
	end

	-- Check for obsolete and nonstandard symbols.
	-- Only the first pattern that matches is reported.
	for _, symbol in ipairs(m_data.nonstandard) do
		local found = {}
		for nonstandard in gmatch(pron, symbol) do
			table.insert(found, nonstandard)
			table.insert(categories, "[[Category:IPA pronunciations with obsolete or nonstandard characters|" .. nonstandard .. "]]")
		end

		if #found > 0 then
			table.insert(err, "obsolete or nonstandard characters (" .. table.concat(found) .. ")")
			break
		end
	end

	-- Check for invalid symbols after removing the following:
	--   1. wikilinks
	--   2. paired HTML tags
	--   3. bolding
	--   4. italics
	--   5. HTML entity for space
	--   6. asterisk at beginning of transcription
	--   7. comma followed by spacing characters
	--   8. superscripts enclosed in superscript parentheses
	-- The wikilink match is lazy so that each link in a transcription with
	-- several links is unwrapped on its own; a greedy match would pair the
	-- first "[[" with the last "]]" and leave stray brackets in between.
	local result = gsub(pron, '%[%[(.-)%]%]', '%1')
	result = gsub(result, "<(%l+)[^>]*>([^<]+)</%1>", "%2")
	result = gsub(result, "'''([^']*)'''", "%1")
	result = gsub(result, "''([^']*)''", "%1")
	-- NOTE(review): the step list calls this an HTML entity, but the literal
	-- here appears to be a space character; confirm against the live module.
	result = gsub(result, " ", "")
	result = gsub(result, "^%*", "")
	result = gsub(result, ",%s+", "")
	result = gsub(result, "⁽[".. m_symbols.superscripts .. "]+⁾", "")
	-- Whatever survives removal of every valid symbol is invalid.
	result = gsub(result, '[' .. m_symbols.valid .. ']', '')

	if result ~= '' then
		local suggestions = {}
		mw.log(pron,result)

		local namespace = mw.title.getCurrentTitle().namespace
		local category
		if namespace == 0 then
			-- main namespace
			category = "IPA pronunciations with invalid IPA characters"
		elseif namespace == 118 then
			-- reconstruction namespace
			category = "IPA pronunciations with invalid IPA characters/reconstruction"
		else
			category = "IPA pronunciations with invalid IPA characters/non_mainspace"
		end

		for character in gmatch(result, ".") do
			local suggestion = m_symbols.suggestions[character]
			if suggestion then
				table.insert(suggestions, character .. " with " .. suggestion)
			end
			table.insert(categories, "[[Category:" .. category .. "|" .. character .. "]]")
		end

		table.insert(err, "invalid IPA characters (" .. result .. ")")
		if #suggestions > 0 then
			table.insert(err, "replace " .. table.concat(suggestions, ", "))
		end
	end

	-- Reference inside IPA template usage
	-- FIXME: Doesn't work; you can't put HTML in module output.
	--if mw.ustring.find(pron, '</ref>') then
	--	table.insert(categories, "[[Category:IPA pronunciations with reference]]")
	--end

	if repr == "phonemic" or repr == "rhyme" then
		local lang_code = lang and lang:getCode()
		local valid_phonemes = lang_code and m_data.phonemes[lang_code]

		if valid_phonemes then
			-- Consume the transcription left to right, always taking the
			-- longest listed phoneme that matches at the current position.
			local rest = pron
			local phonemes = {}

			while ulen(rest) > 0 do
				local first = sub(rest, 1, 1)
				local longestmatch = ""

				if first == "(" or first == ")" then
					-- Parentheses are accepted as they are.
					longestmatch = first
				else
					for _, phoneme in ipairs(valid_phonemes) do
						local phoneme_len = ulen(phoneme)
						if phoneme_len > ulen(longestmatch) and sub(rest, 1, phoneme_len) == phoneme then
							longestmatch = phoneme
						end
					end
				end

				if ulen(longestmatch) > 0 then
					table.insert(phonemes, longestmatch)
					rest = sub(rest, ulen(longestmatch) + 1)
				else
					-- No listed phoneme starts here: highlight one character,
					-- categorise and track it, then carry on after it.
					table.insert(phonemes, "<span style=\"color: red\">" .. first .. "</span>")
					rest = sub(rest, 2)
					table.insert(categories, "[[Category:IPA pronunciations with invalid phonemes/" .. lang_code .. "]]")
					require("Module:debug").track("IPA/invalid phonemes/" .. first)
				end
			end

			pron = table.concat(phonemes)
		end

		-- Put the representation marks back.
		if repr == "phonemic" then
			pron = "/" .. pron .. "/"
		else
			pron = "-" .. pron
		end
	elseif repr == "phonetic" then
		pron = "[" .. pron .. "]"
	elseif repr == "orthographic" then
		pron = "⟨" .. pron .. "⟩"
	end

	if reconstructed then
		pron = "*" .. pron
	end

	local err_html = ""
	if #err > 0 then
		-- The original omitted the closing quote of the style attribute,
		-- which produced malformed HTML.
		err_html = ' <span class="previewonly error" style="font-size: small;">' .. table.concat(err, ', ') .. '</span>'
	end

	local ipa_span = '<span class="IPA" lang="">' .. pron .. '</span>'

	if split_output then -- for use of IPA in links
		return ipa_span, table.concat(categories), err_html
	else
		return ipa_span .. err_html .. table.concat(categories)
	end
end
-- IPA <-> XSAMPA lookup tables
-- Maps an IPA symbol (and its with-descender variant, if any) to the
-- X-SAMPA symbol that produces it. Filled lazily on first use.
local i2x_lookup = {}

-- Fills i2x_lookup from the X-SAMPA data table if it is still empty, and
-- returns it.
--
-- NOTE(review): pairs() order is unspecified, so if two X-SAMPA symbols map
-- to the same IPA symbol, which one ends up in the table is not defined here.
-- NOTE(review): declared as a global function, as in the original.
function Populate_IPA_XSAMPA_LookupTables()
	-- The table has string keys only, so the length operator is always 0 for
	-- it; next() is what actually tells whether it has been filled. The
	-- original `#i2x_lookup == 0` test rebuilt the table on every call.
	if next(i2x_lookup) == nil then
		for XSAMPA_symbol, data in pairs(m_XSAMPA) do
			local IPA_symbol = data[1]
			i2x_lookup[IPA_symbol] = XSAMPA_symbol

			local with_descender = data.with_descender
			if with_descender then
				i2x_lookup[with_descender] = XSAMPA_symbol
			end
		end
	end

	return i2x_lookup
end
-- Converts IPA text to X-SAMPA, one character at a time.
--
-- `text` is either a plain string or a frame table. For a frame, the first
-- argument is used: the {{=}} and {{!}} placeholders are replaced by "=" and
-- "|", character entities are decoded, and the result is passed through
-- mw.text.nowiki before being returned.
-- Characters without an entry in the lookup table are left unchanged.
function export.IPA_to_XSAMPA(text)
	Populate_IPA_XSAMPA_LookupTables()

	local from_frame = type(text) == 'table'
	if from_frame then -- a frame, extract args
		local arg = text.args[1]
		arg = arg:gsub('{{=}}', '=')
		arg = arg:gsub('{{!}}', '|')
		text = mw.text.decode(arg) -- XXX
	end

	text = gsub(text, 'ːː', ':') -- this basically sums up m_symbols[2].XSAMPA
	text = gsub(text, '.', i2x_lookup)

	if from_frame then
		text = mw.text.nowiki(text)
	end

	return text
end
-- Converts X-SAMPA text to IPA.
--
-- `text` is either a plain string or a frame table (its first argument is
-- used after decoding character entities). Input wrapped in <...> is emitted
-- wrapped in ⟨...⟩.
--
-- At each position the longest X-SAMPA symbol (up to four characters) found
-- in the data table is converted; a character with no entry is copied through
-- unchanged. Symbols with a `with_descender` variant use that variant when
-- the nearest preceding non-diacritic symbol is marked `has_descender`.
--
-- Returns the IPA string.
function export.XSAMPA_to_IPA(text)
	local data = m_XSAMPA
	local escape = false
	if type(text) == 'table' then -- a frame, extract args
		text = text.args[1]
		text = mw.text.decode(text) -- XXX
		escape = true
	end

	-- Simpler function adapted from [[w:Module:Sandbox/Erutuon/X-SAMPA]]
	-- `characteristics` runs parallel to the emitted symbols and records the
	-- flags needed for the descender lookup.
	local output, characteristics = {}, {}
	local angle_bracket
	if sub(text, 1, 1) == "<" and sub(text, -1) == ">" then
		table.insert(output, "⟨")
		angle_bracket = "⟩"
		text = sub(text, 2, -2)
	end

	while #text > 0 do
		-- Candidates, longest first: index i holds the first (5 - i) characters.
		local substrings = {
			sub(text, 1, 4),
			sub(text, 1, 3),
			sub(text, 1, 2),
			sub(text, 1, 1)
		}

		for i, substring in ipairs(substrings) do
			local IPA, has_descender, is_diacritic
			local result = data[substring]

			if result then
				IPA = result[1]
				has_descender = result.has_descender
				-- The original assigned this to an undeclared global named
				-- `diacritic`, so the flag was never recorded and diacritics
				-- were never skipped in the scan below.
				is_diacritic = result.is_diacritic

				local with_descender = result.with_descender
				if with_descender then
					-- Go backwards through the transcription, skipping any diacritics.
					-- The lower bound avoids indexing before the first entry
					-- when this symbol has no non-diacritic predecessor.
					local index = #characteristics
					while index >= 1 and characteristics[index].is_diacritic do
						index = index - 1
					end

					-- Look at the first non-diacritic symbol before the current symbol.
					-- If it has a descender, use the descender form of the current symbol.
					local previous = characteristics[index]
					if previous and previous.has_descender then
						IPA = with_descender
					end
				end
			end

			-- Last candidate (a single character) and still nothing to emit:
			-- copy the character through. This also covers a table entry
			-- without an IPA value, which would otherwise never be consumed.
			if not IPA and not substrings[i + 1] then
				IPA = substring
			end

			if IPA then
				-- Drop the (5 - i) characters that were just converted.
				text = sub(text, 6 - i)
				table.insert(output, IPA)
				table.insert(characteristics, { has_descender = has_descender, is_diacritic = is_diacritic } )
				break
			end
		end
	end

	if angle_bracket then
		table.insert(output, angle_bracket)
	end

	output = table.concat(output)

	if escape then
		-- output = mw.text.nowiki(output)
	end

	return output
end
-- Builds the round-trip demonstration table (IPA -> X-SAMPA -> IPA).
--
-- frame.args[1] is a comma-separated list of examples. Each example contains
-- a transcription in /.../ or [...], optionally preceded by "code:term " where
-- code is a two- or three-letter language code; when present, the term is
-- shown as a link.
--
-- Raises an error when a language code is not recognised or when an example
-- contains no transcription.
--
-- Returns the wikitext of the table.
function export.example(frame)
	local output = {}

	table.insert(
		output,
[[
{| class="wikitable"
! Term !! IPA !! Generated X-SAMPA !! Regenerated IPA !! Matched?
]]
	)

	-- Row template; each word in it is replaced by the matching field of
	-- `content` below.
	local row =
[[
|-
| link || IPA || XSAMPA || regenerated_IPA || matched
]]

	local examples = mw.text.split(frame.args[1], ",%s*")

	-- ipairs keeps the rows in the order in which the examples were given.
	for _, example in ipairs(examples) do
		local lang, word = match(example, "(%l%l%l?):(.+) [/%[]")
		if lang then
			lang = m_languages.getByCode(lang) or error('"' .. lang .. '" is not a valid language code.')
		end

		local IPA = match(example, "/[^/]+/")
			or match(example, "%[[^%]]+%]")
			or error('No IPA transcription found in "' .. example .. '".')

		local XSAMPA = export.IPA_to_XSAMPA(IPA)
		local regenerated_IPA = export.XSAMPA_to_IPA(XSAMPA)

		-- Kept local: the original assigned this table to an undeclared
		-- (global) name.
		local content = {
			link = lang and word and m_links.full_link{ term = word, lang = lang },
			matched = IPA == regenerated_IPA
				and '<span style="color: green;">yes</span>'
				or '<span style="color: red;">no</span>',
			IPA = '<span class="IPA">' .. IPA .. '</span>',
			-- The original closed this <code> element with </span>.
			XSAMPA = '<code>' .. XSAMPA .. '</code>',
			regenerated_IPA = '<span class="IPA">' .. regenerated_IPA .. '</span>'
		}

		-- A function rather than the table itself is used as the replacement,
		-- so that a word with no value (e.g. no link) becomes an empty string
		-- instead of being left in place.
		local function add_content(item)
			return content[item] or ""
		end

		local filled_row = gsub(row, "[%a_]+", add_content)

		table.insert(output, filled_row)
	end

	table.insert(output, "|}")

	return table.concat(output)
end
return export