--- romkan.py 2007-02-01 17:36:59.000000000 -0500 +++ uromkan.py 2007-02-02 12:26:39.000000000 -0500 @@ -1,5 +1,5 @@ -#!/usr/bin/python -# encoding: euc-jp +#!/usr/bin/env python +# encoding: utf-8 """romkan.py - a Python rewrite of the Perl Romaji<->Kana conversion module. @@ -10,12 +10,15 @@ This is free software with ABSOLUTELY NO WARRANTY. You can redistribute it and/or modify it under the terms of -the GNU General Public License version 2.""" +the GNU General Public License version 2. -__author__ = "Eric Nichols" -__author_email__ = "eric-n@is.naist.jp" -__version__ = "0.02" -__revision__ = __version__ +Modified by Jason Moiron to work with utf-8 instead of euc-jp +""" + +__author__ = "Jason Moiron" +__author_email__ = "jmoiron@jmoiron.net" +__version__ = "0.02u" +__revision__ = "1" import re @@ -29,79 +32,79 @@ """ return """ -ぁ xa あ a ぃ xi い i ぅ xu -う u う゛ vu う゛ぁ va う゛ぃ vi う゛ぇ ve -う゛ぉ vo ぇ xe え e ぉ xo お o - -か ka が ga き ki きゃ kya きゅ kyu -きょ kyo ぎ gi ぎゃ gya ぎゅ gyu ぎょ gyo -く ku ぐ gu け ke げ ge こ ko -ご go - -さ sa ざ za し si しゃ sya しゅ syu -しょ syo じ zi じゃ zya じゅ zyu じょ zyo -す su ず zu せ se ぜ ze そ so -ぞ zo - -た ta だ da ち ti ちゃ tya ちゅ tyu -ちょ tyo ぢ di ぢゃ dya ぢゅ dyu ぢょ dyo - -っ xtu -っう゛ vvu っう゛ぁ vva っう゛ぃ vvi -っう゛ぇ vve っう゛ぉ vvo -っか kka っが gga っき kki っきゃ kkya -っきゅ kkyu っきょ kkyo っぎ ggi っぎゃ ggya -っぎゅ ggyu っぎょ ggyo っく kku っぐ ggu -っけ kke っげ gge っこ kko っご ggo っさ ssa -っざ zza っし ssi っしゃ ssya -っしゅ ssyu っしょ ssho -っじ zzi っじゃ zzya っじゅ zzyu っじょ zzyo -っす ssu っず zzu っせ sse っぜ zze っそ sso -っぞ zzo った tta っだ dda っち tti -っちゃ ttya っちゅ ttyu っちょ ttyo っぢ ddi -っぢゃ ddya っぢゅ ddyu っぢょ ddyo っつ ttu -っづ ddu って tte っで dde っと tto っど ddo -っは hha っば bba っぱ ppa っひ hhi -っひゃ hhya っひゅ hhyu っひょ hhyo っび bbi -っびゃ bbya っびゅ bbyu っびょ bbyo っぴ ppi -っぴゃ ppya っぴゅ ppyu っぴょ ppyo っふ hhu -っふぁ ffa っふぃ ffi っふぇ ffe っふぉ ffo -っぶ bbu っぷ ppu っへ hhe っべ bbe っぺ ppe -っほ hho っぼ bbo っぽ ppo っや yya っゆ yyu -っよ yyo っら rra っり rri っりゃ rrya -っりゅ rryu っりょ rryo っる rru っれ rre -っろ rro - -つ tu づ du て te で de と to -ど do - -な na に ni にゃ nya にゅ nyu にょ nyo -ぬ nu ね ne の no - -は ha ば ba ぱ pa ひ hi ひゃ hya -ひゅ hyu ひょ hyo び bi びゃ bya びゅ byu -びょ byo ぴ pi ぴゃ pya ぴゅ pyu ぴょ pyo -ふ hu ふぁ fa ふぃ fi ふぇ fe ふぉ fo -ぶ bu ぷ pu へ he べ be ぺ pe -ほ ho ぼ bo ぽ po - -ま ma み mi みゃ mya みゅ myu みょ myo -む mu め me も mo - -ゃ xya や ya ゅ xyu ゆ yu ょ xyo -よ yo - -ら ra り ri りゃ rya りゅ ryu りょ ryo -る ru れ re ろ ro - -ゎ xwa わ wa ゐ wi ゑ we -を wo ん n - -ん n' -でぃ dyi -ー - -ちぇ tye -っちぇ ttye + xa a xi i xu + u vu va vi ve + vo xe e xo o + + ka ga ki kya kyu + kyo gi gya gyu gyo + ku gu ke ge ko + go + + sa za si sya syu + syo zi zya zyu zyo + su zu se ze so + zo + + ta da ti < tya < tyu +< tyo di ≪ dya ≪ dyu ≪ dyo + + xtu +c vvu c vva c vvi +c vve c vvo +c kka c gga c kki c kkya +c kkyu c kkyo c ggi c ggya +c ggyu c ggyo c kku c ggu +c kke c gge c kko c ggo c ssa +c zza c ssi c ssya +c ssyu c ssho +c zzi c zzya c zzyu c zzyo +c ssu c zzu c sse c zze c sso +c zzo c tta c dda c tti +c< ttya c< ttyu c< ttyo c ddi +c≪ ddya c≪ ddyu c≪ ddyo c ttu +c ddu c tte c dde c tto c ddo +c hha c bba c ppa c hhi +c蚊 hhya c蚊 hhyu c蚊 hhyo c bbi +c潟 bbya c潟 bbyu c潟 bbyo c ppi +c眼 ppya c眼 ppyu c眼 ppyo c hhu +c泣 ffa c泣 ffi c泣 ffe c泣 ffo +c bbu c ppu c hhe c bbe c ppe +c hho c bbo c ppo c yya c yyu +c yyo c rra c rri c rrya +c rryu c rryo c rru c rre +c rro + + tu du te de to + do + + na ni nya nyu nyo + nu ne no + + ha ba pa hi 蚊 hya +蚊 hyu 蚊 hyo bi 潟 bya 潟 byu +潟 byo pi 眼 pya 眼 pyu 眼 pyo + hu 泣 fa 泣 fi 泣 fe 泣 fo + bu pu he be pe + ho bo po + + ma mi 帥 mya 帥 myu 帥 myo + mu me mo + + xya ya xyu yu xyo + yo + + ra ri rya ryu ryo + ru re ro + + xwa wa wi we + wo n + + n' +с dyi + - +< tye +c< ttye """ def get_hepburntab(): @@ -114,80 +117,80 @@ """ return """ -ぁ xa あ a ぃ xi い i ぅ xu -う u う゛ vu う゛ぁ va う゛ぃ vi う゛ぇ ve -う゛ぉ vo ぇ xe え e ぉ xo お o + xa a xi i xu + u vu va vi ve + vo xe e xo o -か ka が ga き ki きゃ kya きゅ kyu -きょ kyo ぎ gi ぎゃ gya ぎゅ gyu ぎょ gyo -く ku ぐ gu け ke げ ge こ ko -ご go - -さ sa ざ za し shi しゃ sha しゅ shu -しょ sho じ ji じゃ ja じゅ ju じょ jo -す su ず zu せ se ぜ ze そ so -ぞ zo - -た ta だ da ち chi ちゃ cha ちゅ chu -ちょ cho ぢ di ぢゃ dya ぢゅ dyu ぢょ dyo - -っ xtsu -っう゛ vvu っう゛ぁ vva っう゛ぃ vvi -っう゛ぇ vve っう゛ぉ vvo -っか kka っが gga っき kki っきゃ kkya -っきゅ kkyu っきょ kkyo っぎ ggi っぎゃ ggya -っぎゅ ggyu っぎょ ggyo っく kku っぐ ggu -っけ kke っげ gge っこ kko っご ggo っさ ssa -っざ zza っし sshi っしゃ ssha -っしゅ sshu っしょ ssho -っじ jji っじゃ jja っじゅ jju っじょ jjo -っす ssu っず zzu っせ sse っぜ zze っそ sso -っぞ zzo った tta っだ dda っち cchi -っちゃ ccha っちゅ cchu っちょ ccho っぢ ddi -っぢゃ ddya っぢゅ ddyu っぢょ ddyo っつ ttsu -っづ ddu って tte っで dde っと tto っど ddo -っは hha っば bba っぱ ppa っひ hhi -っひゃ hhya っひゅ hhyu っひょ hhyo っび bbi -っびゃ bbya っびゅ bbyu っびょ bbyo っぴ ppi -っぴゃ ppya っぴゅ ppyu っぴょ ppyo っふ ffu -っふぁ ffa っふぃ ffi っふぇ ffe っふぉ ffo -っぶ bbu っぷ ppu っへ hhe っべ bbe っぺ ppe -っほ hho っぼ bbo っぽ ppo っや yya っゆ yyu -っよ yyo っら rra っり rri っりゃ rrya -っりゅ rryu っりょ rryo っる rru っれ rre -っろ rro - -つ tsu づ du て te で de と to -ど do - -な na に ni にゃ nya にゅ nyu にょ nyo -ぬ nu ね ne の no - -は ha ば ba ぱ pa ひ hi ひゃ hya -ひゅ hyu ひょ hyo び bi びゃ bya びゅ byu -びょ byo ぴ pi ぴゃ pya ぴゅ pyu ぴょ pyo -ふ fu ふぁ fa ふぃ fi ふぇ fe ふぉ fo -ぶ bu ぷ pu へ he べ be ぺ pe -ほ ho ぼ bo ぽ po - -ま ma み mi みゃ mya みゅ myu みょ myo -む mu め me も mo - -ゃ xya や ya ゅ xyu ゆ yu ょ xyo -よ yo - -ら ra り ri りゃ rya りゅ ryu りょ ryo -る ru れ re ろ ro - -ゎ xwa わ wa ゐ wi ゑ we -を wo ん n - -ん n' -でぃ dyi -ー - -ちぇ che -っちぇ cche + ka ga ki kya kyu + kyo gi gya gyu gyo + ku gu ke ge ko + go + + sa za shi sha shu + sho ji ja ju jo + su zu se ze so + zo + + ta da chi < cha < chu +< cho di ≪ dya ≪ dyu ≪ dyo + + xtsu +c vvu c vva c vvi +c vve c vvo +c kka c gga c kki c kkya +c kkyu c kkyo c ggi c ggya +c ggyu c ggyo c kku c ggu +c kke c gge c kko c ggo c ssa +c zza c sshi c ssha +c sshu c ssho +c jji c jja c jju c jjo +c ssu c zzu c sse c zze c sso +c zzo c tta c dda c cchi +c< ccha c< cchu c< ccho c ddi +c≪ ddya c≪ ddyu c≪ ddyo c ttsu +c ddu c tte c dde c tto c ddo +c hha c bba c ppa c hhi +c蚊 hhya c蚊 hhyu c蚊 hhyo c bbi +c潟 bbya c潟 bbyu c潟 bbyo c ppi +c眼 ppya c眼 ppyu c眼 ppyo c ffu +c泣 ffa c泣 ffi c泣 ffe c泣 ffo +c bbu c ppu c hhe c bbe c ppe +c hho c bbo c ppo c yya c yyu +c yyo c rra c rri c rrya +c rryu c rryo c rru c rre +c rro + + tsu du te de to + do + + na ni nya nyu nyo + nu ne no + + ha ba pa hi 蚊 hya +蚊 hyu 蚊 hyo bi 潟 bya 潟 byu +潟 byo pi 眼 pya 眼 pyu 眼 pyo + fu 泣 fa 泣 fi 泣 fe 泣 fo + bu pu he be pe + ho bo po + + ma mi 帥 mya 帥 myu 帥 myo + mu me mo + + xya ya xyu yu xyo + yo + + ra ri rya ryu ryo + ru re ro + + xwa wa wi we + wo n + + n' +с dyi + - +< che +c< cche """ def init_rkdict(table): @@ -350,6 +353,8 @@ # EUC-JP kana codes CHAR = "(?:[\x00-\x7f]|(?:\x8f[\xa1-\xfe]|[\x8e\xa1-\xfe])[\xa1-\xfe])" +# UTF-8 kana codes (i kept ascii because i'm not sure why it's there) +CHAR = "(?:[\x00-\x7f]|(?:\xe3\x82[\x81-\xbf])|(?:\xe3\x83[\x80-\xbc]))" # Romaji -> Kana cr_re = re.compile(r"(%s*?)(%s)" % (CHAR, rompat, )) @@ -375,27 +380,55 @@ word = normalize_double_n(word) word = ck_re.sub(lambda m: m.groups()[0] + kanroms[m.groups()[1]], word) word = n_re.sub("n", word) + # small katakana letters don't get the 'x' taken out when they are + # 'improperly' used to lengthen vowel sounds (ex. - -> ka-bii) + word = word.replace('x', '') return word # Hiragana -> Katakana -h_re = re.compile(r"\xa4(.)") def hirakata(word): """hirakata(string) -> string Converts hiragana string into katakana. """ - word = h_re.sub(r"\xa5\1", word) - return word + s = u'' + uniword = word.decode('utf-8') + for char in uniword: + if ord(char) > 0x3040 and ord(char) < 0x3097: + s += unichr(ord(char) + 0x60) + else: + s += char + return s.encode('utf-8') # Katakana -> Hiragana -k_re = re.compile(r"\xa5(.)") def katahira(word): """katahira(string) -> string Converts katakana string into hiragana. """ - word = k_re.sub(r"\xa4\1", word) - return word + s = u'' + uniword = word.decode('utf-8') + for char in uniword: + if ord(char) > 0x30A0 and ord(char) < 0x30F7: + s += unichr(ord(char) - 0x60) + else: + s += char + return s.encode('utf-8') + +def defullw(word): + """defullw(string) -> string + + Converts Fullwidth unicode characters to ascii equivalents. + + """ + s = u'' + uniword = word.decode('utf-8') + for char in uniword: + if ord(char) >= 0xFF00 and ord(char) <= 0xff5f: + s += unichr(ord(char) - 0xfee0) + else: + s += char + return s.encode('utf-8')