270 lines
10 KiB
JavaScript
270 lines
10 KiB
JavaScript
// 其它字典对象
|
|
const Word = require("./Word")
|
|
const {shakeDom, log, shakeDomFocus} = require('./Utility')
|
|
|
|
const os = require('os')
|
|
|
|
class DictOther {
|
|
constructor(fileContent, filename, filePath, seperator, dictFormat) {
|
|
this.dictTypeName = 'DictOther'
|
|
this.filePath = filePath // 文件路径
|
|
this.filename = filename // 文件路径
|
|
this.lastIndex = 0 // 最后一个 Index 的值,用于新添加词时,作为唯一的 id 传入
|
|
this.seperator = seperator ||' ' // 默认间隔符为空格
|
|
this.dictFormat = dictFormat || 'cww' // 码表格式: 一码多词什么的 cww: 一码多词 | wc: 一词一码 | cw: 一码一词
|
|
this.characterMap = new Map() // 单字码表,用于根据此生成词语码表
|
|
this.wordsOrigin = this.getDictWordsInNormalMode(fileContent)
|
|
}
|
|
// 总的词条数量
|
|
get countDictOrigin(){
|
|
return this.wordsOrigin.length
|
|
}
|
|
|
|
// 设置 seperator
|
|
setSeperator(seperator){
|
|
this.seperator = seperator
|
|
}
|
|
// 设置 dictFormat
|
|
setDictFormat(dictFormat){
|
|
this.dictFormat = dictFormat
|
|
}
|
|
|
|
// 获取指定字数的词条组
|
|
getWordsLengthOf(length){
|
|
switch (length){
|
|
case 0:
|
|
return this.wordsOrigin
|
|
case 1:
|
|
case 2:
|
|
case 3:
|
|
case 4:
|
|
return this.wordsOrigin.filter(word => word.word.length === length)
|
|
default:
|
|
return this.wordsOrigin.filter(word => word.word.length > 4)
|
|
}
|
|
}
|
|
|
|
// 查重,返回重复定义的字词
|
|
// includeCharacter 当包含单字时
|
|
getRepetitionWords(filterSingleCharacter){
|
|
let startPoint = new Date().getTime()
|
|
let wordMap = new Map()
|
|
let repetitionWords = []
|
|
this.wordsOrigin.forEach(word => {
|
|
if (filterSingleCharacter){
|
|
if (wordMap.has(word.word) && word.word.length === 1){
|
|
repetitionWords.push(word)
|
|
let matchedWord = wordMap.get(word.word)
|
|
if (matchedWord) repetitionWords.push(matchedWord)
|
|
} else { // 如果 map 中没有这个词的记录,添加这个记录
|
|
wordMap.set(word.word, word)
|
|
}
|
|
} else {
|
|
if (wordMap.has(word.word) && word.word.length > 1){ // 单字没必要查重,所以这里只搜索 2 个字以上的词
|
|
repetitionWords.push(word)
|
|
let matchedWord = wordMap.get(word.word)
|
|
if (matchedWord) repetitionWords.push(matchedWord)
|
|
} else { // 如果 map 中没有这个词的记录,添加这个记录
|
|
wordMap.set(word.word, word)
|
|
}
|
|
}
|
|
})
|
|
// 排序后再去除重复项
|
|
repetitionWords.sort((a, b) => {
|
|
// log(a.word + a.code, b.word + b.code)
|
|
return (a.word + a.code) > (b.word + b.code) ? 1 : -1
|
|
})
|
|
log('重复词条数量:未去重之前 ', repetitionWords.length)
|
|
|
|
for (let i = 0; i < repetitionWords.length - 1; i++) {
|
|
if (repetitionWords[i].id === repetitionWords[i + 1].id ) {
|
|
repetitionWords.splice(i,1)
|
|
i = i - 1
|
|
}
|
|
}
|
|
log(`查重完成,用时 ${new Date().getTime() - startPoint} ms`)
|
|
log('词条字典数量: ', wordMap.size)
|
|
log('重复词条数量: ', repetitionWords.length)
|
|
log('重复 + 词条字典 = ', repetitionWords.length + wordMap.size)
|
|
return repetitionWords
|
|
}
|
|
|
|
// 返回所有 word
|
|
getDictWordsInNormalMode(fileContent){
|
|
let startPoint = new Date().getTime()
|
|
let EOL = this.getFileEOLFrom(fileContent)
|
|
let lines = fileContent.split(EOL) // 拆分词条与编码成单行
|
|
this.lastIndex = lines.length + 1
|
|
// 如果为纯词模式,就使用所有的行,否则就根据分隔符进行筛选
|
|
let linesValid = this.dictFormat === 'w'? lines: lines.filter(item => item.indexOf(this.seperator) > -1)
|
|
let words = []
|
|
log('正常词条的行数:',linesValid.length)
|
|
linesValid.forEach(item => {
|
|
let currentWords = this.getWordsFromLine(item)
|
|
words.push(...currentWords) // 拼接词组
|
|
currentWords.forEach(currentWord => {
|
|
if (currentWord.word.length === 1
|
|
&& currentWord.code.length >=2
|
|
&& !this.characterMap.has(currentWord.word)) // map里不存在这个字
|
|
{ // 编码长度为 4 的单字
|
|
this.characterMap.set(currentWord.word, currentWord.code)
|
|
}
|
|
})
|
|
})
|
|
log(`处理文件完成,共:${words.length } 条,用时 ${new Date().getTime() - startPoint} ms`)
|
|
return words
|
|
}
|
|
|
|
// 排序
|
|
sort(){
|
|
let startPoint = new Date().getTime()
|
|
this.wordsOrigin.sort((a,b) => a.code < b.code ? -1: 1)
|
|
log(`排序用时 ${new Date().getTime() - startPoint} ms`)
|
|
}
|
|
|
|
|
|
// 依次序添加 words
|
|
addWordsInOrder(words){
|
|
let startPoint = new Date().getTime()
|
|
words.forEach(word => {
|
|
this.addWordToDictInOrder(word)
|
|
})
|
|
log(`添加 ${words.length } 条词条到指定码表, 用时 ${new Date().getTime() - startPoint} ms`)
|
|
}
|
|
|
|
// 依次序添加 word
|
|
addWordToDictInOrder(word){
|
|
let insetPosition = null // 插入位置 index
|
|
this.sort() // 插入之前排序码表
|
|
for (let i=0; i<this.wordsOrigin.length-1; i++){ // -1 为了避免下面 i+1 为 undefined
|
|
if (word.code >= this.wordsOrigin[i] && word.code <= this.wordsOrigin[i+1].code){
|
|
insetPosition = i + 1
|
|
break
|
|
}
|
|
}
|
|
if (!insetPosition){ // 没有匹配到任何位置,添加到结尾
|
|
insetPosition = this.wordsOrigin.length
|
|
}
|
|
let wordInsert = word.clone() // 断开与别一个 dict 的引用链接,新建一个 word 对象,不然两个 dict 引用同一个 word
|
|
wordInsert.setId(this.lastIndex++) // 给新的 words 一个新的唯一 id
|
|
this.wordsOrigin.splice(insetPosition, 0, wordInsert)
|
|
}
|
|
|
|
// 判断码表文件的换行符是 \r\n 还是 \n
|
|
getFileEOLFrom(fileContent){
|
|
if(fileContent.indexOf('\r\n') > 0){
|
|
return '\r\n'
|
|
} else {
|
|
return '\n'
|
|
}
|
|
}
|
|
|
|
|
|
// 删除词条
|
|
deleteWords(wordIdSet){
|
|
this.wordsOrigin = this.wordsOrigin.filter(item => !wordIdSet.has(item.id))
|
|
}
|
|
|
|
// 转为 String
|
|
toYamlString(){
|
|
let fileContentString = ''
|
|
this.wordsOrigin.forEach(item =>{
|
|
fileContentString = fileContentString + item.toYamlString() + os.EOL
|
|
})
|
|
return fileContentString
|
|
}
|
|
|
|
toExportString(seperator, dictFormat){
|
|
let startPoint = new Date().getTime()
|
|
let fileContentString = ''
|
|
switch (dictFormat){
|
|
case 'cww':
|
|
let codeMap = new Map() // code: [word, word, word]
|
|
this.wordsOrigin.forEach((word, index) => {
|
|
let code = word.code
|
|
if (codeMap.has(code)){ // 用 map 记录所有 code, 如果有就添加到对应的 value 中,没有就新增 map item
|
|
codeMap.set(code, codeMap.get(code).concat(word))
|
|
} else {
|
|
codeMap.set(code, [word])
|
|
}
|
|
})
|
|
codeMap.forEach((wordArray, code) => {
|
|
let oneCodewordsString = ''
|
|
wordArray.forEach(item => {oneCodewordsString = oneCodewordsString.concat(seperator + item.word)}) // seperater + wordsString
|
|
fileContentString = fileContentString.concat(code, oneCodewordsString, os.EOL)
|
|
})
|
|
log(`词条文本已生成,用时 ${new Date().getTime() - startPoint} ms`)
|
|
return fileContentString
|
|
case 'cw':
|
|
this.wordsOrigin.forEach(word => {
|
|
fileContentString = fileContentString.concat(word.toFileString(seperator, true), os.EOL)
|
|
})
|
|
log(`词条文本已生成,用时 ${new Date().getTime() - startPoint} ms`)
|
|
return fileContentString
|
|
case 'wc':
|
|
this.wordsOrigin.forEach(word => {
|
|
fileContentString = fileContentString.concat(word.toFileString(seperator, false), os.EOL)
|
|
})
|
|
log(`词条文本已生成,用时 ${new Date().getTime() - startPoint} ms`)
|
|
return fileContentString
|
|
case 'w':
|
|
this.wordsOrigin.forEach(word => {
|
|
fileContentString = fileContentString.concat(word.word, os.EOL)
|
|
})
|
|
log(`词条文本已生成,用时 ${new Date().getTime() - startPoint} ms`)
|
|
return fileContentString
|
|
}
|
|
}
|
|
|
|
// 在 origin 中调换两个词条的位置
|
|
exchangePositionInOrigin(word1, word2){
|
|
// 确保 word1 在前
|
|
if (parseInt(word1.id) > parseInt(word2.id)){
|
|
let temp = word1
|
|
word1 = word2
|
|
word2 = temp
|
|
}
|
|
for(let i=0; i<this.wordsOrigin.length; i++){
|
|
let tempWord = this.wordsOrigin[i]
|
|
if (tempWord.isEqualTo(word1)){
|
|
this.wordsOrigin[i] = word2
|
|
}
|
|
if (tempWord.isEqualTo(word2)){
|
|
this.wordsOrigin[i] = word1
|
|
}
|
|
}
|
|
}
|
|
|
|
// 从一条词条字符串中获取 word 对象
|
|
// 一编码对应多词
|
|
getWordsFromLine(lineStr){
|
|
let wordArray = lineStr.split(this.seperator)
|
|
let words = []
|
|
let code, word
|
|
switch (this.dictFormat){
|
|
case 'cww':
|
|
code = wordArray[0]
|
|
for(let i=1; i<wordArray.length;i++){
|
|
words.push(new Word(this.lastIndex, code, wordArray[i]))
|
|
this.lastIndex = this.lastIndex + 1
|
|
}
|
|
return words
|
|
case 'cw':
|
|
code = wordArray[0]
|
|
word = wordArray[1]
|
|
return [new Word(this.lastIndex++, code, word)]
|
|
case 'wc':
|
|
word = wordArray[0]
|
|
code = wordArray[1]
|
|
return [new Word(this.lastIndex++, code, word)]
|
|
case 'w':
|
|
word = wordArray[0]
|
|
// code = getCodeFromWord(word)
|
|
return [new Word(this.lastIndex++, '', word)]
|
|
}
|
|
}
|
|
}
|
|
|
|
|
|
module.exports = DictOther
|