Skip to content

Commit

Permalink
Merge pull request #7 from kyohsuke/bump_up_ruby_2_2_x
Browse files Browse the repository at this point in the history
convert ruby 2.2.x
  • Loading branch information
yutopia committed May 8, 2016
2 parents 57f1bbd + c99833a commit 5d238e8
Show file tree
Hide file tree
Showing 10 changed files with 239 additions and 225 deletions.
3 changes: 2 additions & 1 deletion convert2skk/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -322,4 +322,5 @@ notes 辞書に転用できますし、やり方によっては SKK 以外にも
こうじゅつろうどく /口述朗読;‖<autogen>,名詞-サ変接続/

## 著者
三田祐介 <[email protected]>

三田祐介 < clefs<span></span>@mail.goo.ne.jp >
20 changes: 11 additions & 9 deletions convert2skk/aozora2skk.rb
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
#!/usr/local/bin/ruby -Ke
# -*- coding: euc-jp -*-
#!/usr/bin/env ruby
# -*- coding: utf-8 -*-
## Copyright (C) 2005 MITA Yuusuke <[email protected]>
##
## Author: MITA Yuusuke <[email protected]>
Expand Down Expand Up @@ -32,11 +32,11 @@
##
## % aozora2skk.rb file-from-aozora-bunko.html > result.txt
##
# ○
require 'jcode' if RUBY_VERSION.to_f < 1.9
#require 'kconv'
#require 'skkdictools'
# ○

Encoding.default_external = "euc-jis-2004"
require 'optparse'

opt = OptionParser.new

results = []
Expand All @@ -45,18 +45,20 @@
opt.on('-a', 'append annotation <autogen - aozora>') { note = true }
begin
opt.parse!(ARGV)
rescue OptionParser::InvalidOption => e
rescue OptionParser::InvalidOption
print "'#{$0} -h' for help.\n"
exit 1
end



while gets
$_.encode!("utf-8")
$_.gsub!(/<[^>]*>/, '')
results = results + $_.scan(/([亜-熙]{2,})[  ]*[\[(([〔【]([ぁ-ん]*)[\]))〕]】]/)
results = results + $_.scan(/([亜-熙]{2,})[  ]*[\[(([〔【]([ぁ-ん]*)[\]))〕]】]/)
end

results.uniq!
results.each {|word,yomi|
print "#{yomi} /#{word}#{note ? ';<autogen - aozora>' : ''}/\n"
print "#{yomi} /#{word}#{note ? ';<autogen - aozora>' : ''}/\n"
}
9 changes: 5 additions & 4 deletions convert2skk/canna2skk.rb
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
#!/usr/bin/env ruby
# -*- coding: euc-jp -*-
# -*- coding: utf-8 -*-
# canna2skk.rb -- convert Canna dictionary to SKK-JISYO format.
#
# Copyright (C) 2003 NAKAJIMA Mikio <[email protected]>
Expand Down Expand Up @@ -33,8 +33,9 @@
# $ canna2skk.rb gcanna.t gcannaf.t > tmp.jisyo
# $ skkdic-expr2 tmp.jisyo > SKK-JISYO.canna
#
# かん #JS*8 巻 #CNSUC2*2 間 #JS 缶 貫 #JSSUC 間
# かん #JS*8 巻 #CNSUC2*2 間 #JS 缶 貫 #JSSUC 間

Encoding.default_external = "euc-jis-2004"
file = ARGV.shift
open(file).each{|line|
if !(line =~ /([^ ]+) (.+) *$/)
Expand All @@ -44,9 +45,9 @@
words = $2
words.split(' ').each{|word|
if (word =~ /[#*a-zA-Z0-9]+/ || key == word)
next
next
else
print key, " /", word, "/\n"
print key, " /", word, "/\n"
end
}
end
Expand Down
92 changes: 45 additions & 47 deletions convert2skk/chasen2skk.rb
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
#!/usr/local/bin/ruby -Ke
# -*- coding: euc-jp -*-
#!/usr/bin/env ruby
# -*- coding: utf-8 -*-

## Copyright (C) 2005 MITA Yuusuke <[email protected]>
##
## Author: MITA Yuusuke <[email protected]>
Expand Down Expand Up @@ -36,19 +37,15 @@
##
## skkdictools.rb required.
##
## TODO: pick up compound-verbs, eg. 「舞い散る」
## 舞い マイ 舞う 動詞-自立 五段・ワ行促音便 連用形
## 散る チル 散る 動詞-自立 五段・ラ行 基本形
## TODO: pick up compound-verbs, eg. 「舞い散る」
## 舞い マイ 舞う 動詞-自立 五段・ワ行促音便 連用形
## 散る チル 散る 動詞-自立 五段・ラ行 基本形
##
require 'jcode' if RUBY_VERSION.to_f < 1.9
require 'kconv'
require 'skkdictools'

#require 'cgi'
#require 'socket'
#require 'timeout'

Encoding.default_external = "euc-jis-2004"
require_relative 'skkdictools'
require 'optparse'

opt = OptionParser.new

katakana_words = false
Expand Down Expand Up @@ -77,62 +74,63 @@

begin
opt.parse!(ARGV)
rescue OptionParser::InvalidOption => e
rescue OptionParser::InvalidOption
print "'#{$0} -h' for help.\n"
exit 1
end

#keyword_pat = Regexp.compile("[亜-熙]*#{keyword}[亜-熙]*")
#keyword_pat = Regexp.compile("[亜-熙]*#{keyword}[亜-熙]*")

count = 0
#key = word = last_key = last_word = last_part = ""
key = word = last_part = ""
poisoned = terminate = false

while gets
midasi, yomi, root, part, conj = $_.split(" ", 5)
#if midasi !~ /^[亜-熙ァ-ンヴー]+$/ || terminate
if (midasi !~ /^[亜-熙ァ-ンヴー々]+$/ &&
(!allow_noun_chains || part !~ /名詞/ || part =~ /非自立/ ||
midasi !~ /^[亜-熙ァ-ンヴー々ぁ-ん]+$/ )) || terminate
#if (midasi !~ /^[亜-熙ァ-ンヴー]+$/ && conj !~ /連用形/) || terminate
$_.encode!("utf-8")
midasi, yomi, _root, part, _conj = $_.split(" ", 5)
#if midasi !~ /^[亜-熙ァ-ンヴー]+$/ || terminate
if (midasi !~ /^[亜-熙ァ-ンヴー々]+$/ &&
(!allow_noun_chains || part !~ /名詞/ || part =~ /非自立/ ||
midasi !~ /^[亜-熙ァ-ンヴー々ぁ-ん]+$/ )) || terminate
#if (midasi !~ /^[亜-熙ァ-ンヴー]+$/ && conj !~ /連用形/) || terminate
#next if count < 1
if count < 1
next if !handle_prefix
if part =~ /接頭詞/
# kludge - keep prefix w/o increasing count (cf.「ご立派」「お味噌」)
key = yomi.to_hiragana
word = midasi
last_part = part
#elsif part =~ /自立/ && conj =~ /連用形/
# hogehoge
if part =~ /接頭詞/
# kludge - keep prefix w/o increasing count (cf.「ご立派」「お味噌」)
key = yomi.to_hiragana
word = midasi
last_part = part
#elsif part =~ /自立/ && conj =~ /連用形/
# hogehoge
else
key = word = last_part = ""
key = word = last_part = ""
end
next
end

if midasi =~ /^[^亜-熙ァ-ンヴー々]+$/ && !terminate
if midasi =~ /^[^亜-熙ァ-ンヴー々]+$/ && !terminate
# nothing
else
if part =~ /接続詞|接頭詞|副詞[^可]/
# nothing - decline some parts
elsif midasi =~ /並び|及び/
# nothing - (HACK) decline conjunctions that ChaSen overlooks
elsif midasi =~ /^[ぁ-ん]+[亜-熙ァ-ンヴー々]+/
# nothing - this applies to quasi-words such as:
# に関する ニカンスル に関する 助詞-格助詞-連語
if part =~ /接続詞|接頭詞|副詞[^可]/
# nothing - decline some parts
elsif midasi =~ /並び|及び/
# nothing - (HACK) decline conjunctions that ChaSen overlooks
elsif midasi =~ /^[ぁ-ん]+[亜-熙ァ-ンヴー々]+/
# nothing - this applies to quasi-words such as:
# に関する ニカンスル に関する 助詞-格助詞-連語
else
key += yomi.to_hiragana
word += midasi
last_part = part
# asayaKify here?
key += yomi.to_hiragana
word += midasi
last_part = part
# asayaKify here?
end
end

if word =~ /^[ぁ-んー]+$/
if word =~ /^[ぁ-んー]+$/
# nothing
elsif !katakana_words && word =~ /^[ァ-ンヴー]+$/
elsif !katakana_words && word =~ /^[ァ-ンヴー]+$/
# nothing
elsif !keyword.empty? && !word.include?(keyword)
# nothing
Expand All @@ -147,19 +145,19 @@
count = 0

else
if count > 0 && part =~ /接続詞|接頭詞|副詞[^可]/
if count > 0 && part =~ /接続詞|接頭詞|副詞[^可]/
terminate = true
redo
elsif count == 0 && part =~ /接尾/
# avoid generating 「回大会」 from 「第3回大会」
# 回 カイ 回 名詞-接尾-助数詞
elsif count == 0 && part =~ /接尾/
# avoid generating 「回大会」 from 「第3回大会」
# 回 カイ 回 名詞-接尾-助数詞
key = word = last_part = ""
next
end
count += 1
key += yomi.to_hiragana
word += midasi
last_part = part
poisoned = true if part =~ /未知語/
poisoned = true if part =~ /未知語/
end
end
54 changes: 27 additions & 27 deletions convert2skk/ctdicconv.rb
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
#!/usr/bin/ruby -Ke
# -*- coding: euc-jp -*-
require 'jcode' if RUBY_VERSION.to_f < 1.9
#!/usr/bin/env ruby
# -*- coding: utf-8 -*-

# ctdicconv.rb -- convert china_taiwan.csv to SKK-JISYO dictionary format.
#
Expand Down Expand Up @@ -30,10 +29,11 @@
#
# Commentary:

Encoding.default_external = "euc-jis-2004"
$ANNOTATION = true
##$ANNOTATION = false

# from 「オブジェクト指向スクリプト言語ruby」p121
# from 「オブジェクト指向スクリプト言語ruby」p121
def csv_split(source, delimiter = ',')
csv = []
data = ""
Expand All @@ -45,8 +45,8 @@ def csv_split(source, delimiter = ',')
end
if /^"/ =~ data
if /[^"]"$/ =~ data or '""' == data
csv << data.sub(/^"(.*)"$/, '\1').gsub(/""/, '"')
data = ''
csv << data.sub(/^"(.*)"$/, '\1').gsub(/""/, '"')
data = ''
end
else
csv << d
Expand All @@ -60,32 +60,32 @@ def csv_split(source, delimiter = ',')
file = ARGV.shift

if not file
print "ファイルを指定して下さい\n"
print "ファイルを指定して下さい\n"
else
first = true
File.foreach(file) do |line|
if first
first = false
next
end
#中国・台湾,種別,英語見出し,漢字,日本語読み,中国語読み(カタカナ),英語標記2,漢字別名,漢字別名読み,省都,省都読み,annotation
c_t,d,e_key,kanji,j_key,c_key,english,kanji_alias,kanji_alias_key,capital,capital_key,annotation= csv_split(line.chomp)
#中国・台湾,種別,英語見出し,漢字,日本語読み,中国語読み(カタカナ),英語標記2,漢字別名,漢字別名読み,省都,省都読み,annotation
_c_t, _d,e_key,kanji,j_key,c_key,_english,kanji_alias,kanji_alias_key,_capital,_capital_key,annotation= csv_split(line.chomp)
if (e_key && !e_key.empty? && kanji && !kanji.empty?)
e_key.strip!
kanji.strip!
# 英語見出し /漢字/
# 英語見出し /漢字/
if ($ANNOTATION && annotation && !annotation.empty?)
annotation.strip!
annotation.strip!
print e_key, " /", kanji, ";", annotation, "/\n"
else
print e_key, " /", kanji, "/\n"
end

# 日本語見出し /Capitalized 英語/
# 日本語見出し /Capitalized 英語/
if (j_key && !j_key.empty?)
j_key.strip!
j_key.strip!
if ($ANNOTATION && annotation && !annotation.empty?)
annotation.strip!
annotation.strip!
print j_key, " /", e_key.capitalize, ";", annotation, "/\n"
else
print j_key, " /", e_key.capitalize, "/\n"
Expand All @@ -94,35 +94,35 @@ def csv_split(source, delimiter = ',')
end

if (j_key && !j_key.empty? && kanji && !kanji.empty?)
# 日本語見出し /漢字/
# 日本語見出し /漢字/
if ($ANNOTATION && annotation && !annotation.empty?)
annotation.strip!
print j_key, " /", kanji, ";", annotation, "/\n"
annotation.strip!
print j_key, " /", kanji, ";", annotation, "/\n"
else
print j_key, " /", kanji, "/\n"
print j_key, " /", kanji, "/\n"
end
end

if (c_key && !c_key.empty? && kanji && !kanji.empty?)
c_key.strip!
c_key.tr!("ァ-ン", "ぁ-ん")
# 中国語見出し /漢字/
c_key.tr!("ァ-ン", "ぁ-ん")
# 中国語見出し /漢字/
if ($ANNOTATION && annotation && !annotation.empty?)
print c_key, " /", kanji, ";", annotation, "/\n"
print c_key, " /", kanji, ";", annotation, "/\n"
else
print c_key, " /", kanji, "/\n"
print c_key, " /", kanji, "/\n"
end
end
# 漢字別名見出し /漢字別名/
# 漢字別名見出し /漢字別名/
if (kanji_alias && kanji_alias_key &&
!kanji_alias.empty? && !kanji_alias_key.empty?)
!kanji_alias.empty? && !kanji_alias_key.empty?)
if ($ANNOTATION && annotation && !annotation.empty?)
print kanji_alias_key, " /", kanji_alias, ";", annotation, "/\n"
print kanji_alias_key, " /", kanji_alias, ";", annotation, "/\n"
else
print kanji_alias_key, " /", kanji_alias, "/\n"
print kanji_alias_key, " /", kanji_alias, "/\n"
end
end
# 省都見出し /省都/
# 省都見出し /省都/
#if (capital && capital_key &&
# !capital.empty? && !capital_key.empty?)
# print capital_key, " /", capital, "/\n"
Expand Down
3 changes: 3 additions & 0 deletions convert2skk/dic-it2skk.rb
Original file line number Diff line number Diff line change
@@ -1,4 +1,6 @@
#!/usr/bin/env ruby
# -*- coding: utf-8 -*-

# dic-it2skk.rb -- convert dic-it dictionary to SKK-JISYO format.
#
# Copyright (C) 2003 NAKAJIMA Mikio <[email protected]>
Expand Down Expand Up @@ -32,6 +34,7 @@
# $ dic-it2skk.rb dic-it.txt > tmp.jisyo
# $ skkdic-expr2 tmp.jisyo > SKK-JISYO.dic-it
#
Encoding.default_external = "euc-jis-2004"
file = ARGV.shift
open(file).each{|line|
if !(line =~ /([^ \/]+)\/([^ ]+) *$/)
Expand Down
Loading

0 comments on commit 5d238e8

Please sign in to comment.