NLP 中文分词-双向匹配算法(理论+Python实现)

1730阅读 0评论 2021-10-06 专注的阿熊
分类:Python/Ruby

import time

import re

class Segment:
    """Dictionary-based word segmenter using bidirectional maximum matching.

    The sentence is segmented twice: left to right (forward maximum matching,
    ``MM``) and right to left (backward maximum matching, ``RMM``).
    ``printFinalResult`` prints both segmentations and stores the selected
    one in ``final_res``.

    Result formats:
        * ``result_MM`` / ``final_res``: ``"w1/ w2/ ... wn/ "`` in sentence order.
        * ``result_RMM``: ``"/wn/.../w2/w1"`` in discovery order (last word first).
    """

    # Class-level defaults so the attributes exist even on an instance created
    # without __init__.  ``len`` and ``dict`` shadow builtins only as attribute
    # names; they are kept because they are part of the public interface.
    sentence = ""
    MaxLen = 0
    pos = 0
    len = 0
    result_MM = ""  # forward (MM) segmentation result
    result_RMM = ""  # backward (RMM) segmentation result
    final_res = ""
    dict = frozenset()  # immutable default; each instance gets its own set

    def __init__(self, sentence, MaxLen):
        """Store the sentence and window size, then load the dictionary.

        Args:
            sentence: Text to segment.
            MaxLen: Maximum number of characters tried for a single word.

        Raises:
            OSError: If the dictionary file cannot be opened (see readDict).
        """
        self.sentence = sentence
        self.MaxLen = MaxLen
        self.pos = 0
        self.len = self.MaxLen
        self.result_MM = ""
        self.result_RMM = ""
        self.final_res = ""
        # Per-instance container: a shared class-level list would grow (and
        # duplicate its entries) every time another Segment is constructed.
        self.dict = set()
        self.readDict()

    def readDict(self):
        """Load words from ``chineseDic.txt`` in the current directory.

        Each line contributes the text before its first comma.  Surrounding
        whitespace (including the newline of a line without a comma) is
        stripped and blank entries are skipped.  The loaded words replace
        ``self.dict``.
        """
        words = set()
        with open("chineseDic.txt", "r", encoding="utf-8") as f:
            for line in f:
                word = line.split(",")[0].strip()
                if word:
                    words.add(word)
        self.dict = words

    def MM(self, nLen, nPos):
        """Forward maximum matching from ``nPos`` to the end of the sentence.

        At each position the longest window (at most ``nLen`` characters for
        the first word, ``self.MaxLen`` afterwards) found in the dictionary is
        taken; if none matches, a single character is emitted.  Each word is
        appended to ``self.result_MM`` as ``word + "/ "``.

        Args:
            nLen: Window size for the first word.
            nPos: Index of the first character to segment.
        """
        sentence = self.sentence
        total = len(sentence)
        nPos = max(nPos, 0)
        pieces = []
        # Iterative on purpose: one recursive call per probe overflows the
        # interpreter stack on long sentences.
        while nPos < total:
            # Never let the window run past the end of the sentence.
            window = min(max(nLen, 1), total - nPos)
            while window > 1 and sentence[nPos:nPos + window] not in self.dict:
                window -= 1
            pieces.append(sentence[nPos:nPos + window] + "/ ")
            nPos += window
            nLen = self.MaxLen
        self.result_MM += "".join(pieces)

    def RMM(self, nLen, nPos):
        """Backward maximum matching from ``nPos`` down to the sentence start.

        ``nPos`` is the exclusive end index of the text still to segment
        (pass ``len(sentence)`` to segment everything).  Each word is appended
        to ``self.result_RMM`` as ``"/" + word``, so the string lists the
        words last-to-first.

        Args:
            nLen: Window size for the first (right-most) word.
            nPos: Exclusive end index to start matching from.
        """
        sentence = self.sentence
        nPos = min(nPos, len(sentence))
        pieces = []
        while nPos > 0:
            # Clamp to the characters that remain; a negative slice start
            # would wrap around to the end of the string.
            window = min(max(nLen, 1), nPos)
            while window > 1 and sentence[nPos - window:nPos] not in self.dict:
                window -= 1
            # Same "/word" layout for matched and unmatched tokens so that
            # neighbouring tokens can never fuse together.
            pieces.append("/" + sentence[nPos - window:nPos])
            nPos -= window
            nLen = self.MaxLen
        self.result_RMM += "".join(pieces)

    def getMMResult(self):
        """Return the forward-matching result string."""
        return self.result_MM

    def getRMMResult(self):
        """Return the backward-matching result string."""
        return self.result_RMM

    def getFinalResult(self):
        """Return the result selected by printFinalResult ("" before that)."""
        return self.final_res

    @staticmethod
    def _pick_segmentation(forward, backward):
        """Choose between two differing segmentations of the same sentence.

        Prefers the one with fewer words; on a tie the one with fewer
        single-character words; on a further tie the backward result.
        """
        if len(forward) != len(backward):
            return forward if len(forward) < len(backward) else backward
        singles_forward = sum(1 for tok in forward if len(tok) == 1)
        singles_backward = sum(1 for tok in backward if len(tok) == 1)
        return forward if singles_forward < singles_backward else backward

    def printFinalResult(self):
        """Print both segmentations, select one and store it in final_res.

        When the two passes agree the forward result is used unchanged.
        When they differ, the result is chosen by ``_pick_segmentation`` and
        stored in the forward ``"word/ "`` format.
        """
        print("正向最大匹配结果:")
        seg_res_MM = self.result_MM.replace(" ", "")
        print(seg_res_MM)
        # Splitting on "/" leaves empty strings at the edges; drop them.
        seg_list_MM = [tok for tok in seg_res_MM.split("/") if tok]
        print(seg_list_MM)

        print("逆向最大匹配结果:")
        seg_res_RMM = self.result_RMM.replace(" ", "")
        print(seg_res_RMM)
        # RMM stores words last-to-first; reverse to sentence order.
        seg_list_RMM = [tok for tok in reversed(seg_res_RMM.split("/")) if tok]
        print(seg_list_RMM)

        # Compare the complete lists, not just a common prefix.
        if seg_list_MM == seg_list_RMM:
            print("两次分词结果一致。")
            chosen = seg_list_MM
        else:
            print("两次分词结果不一致。")
            chosen = self._pick_segmentation(seg_list_MM, seg_list_RMM)

        print("最终的分词结果为:")
        if chosen is seg_list_MM:
            self.final_res = self.result_MM
        else:
            self.final_res = "".join(tok + "/ " for tok in chosen)
        print(self.final_res)

def to_region(segmentation):
    """Convert a segmentation string into 1-based inclusive word intervals.

    Args:
        segmentation: Whitespace-separated tokens, each a word followed by a
            single "/" marker (for example ``"ab/ c/ "``).

    Returns:
        A list of ``(start, end)`` tuples, one per word, giving the 1-based
        inclusive character positions the word occupies.  Tokens that carry
        no characters besides the marker are skipped, so an empty input
        yields an empty list.
    """
    region = []
    start = 1
    for word in re.split(r"\s+", segmentation.strip()):
        # A token shorter than 2 holds no word text (empty string or a bare
        # "/"); it would otherwise produce an inverted interval (end < start).
        if len(word) < 2:
            continue
        # The token includes the trailing "/", hence -2 for an inclusive end.
        end = start + len(word) - 2
        region.append((start, end))
        start = end + 1
    return region

def PRF(target, pred):
    """Print and return precision, recall and F1 of predicted word intervals.

    Args:
        target: Iterable of gold-standard hashable items (word intervals).
        pred: Iterable of predicted hashable items (word intervals).

    Returns:
        A ``(precision, recall, f1)`` tuple of floats.  Any ratio whose
        denominator is zero is reported as 0.0 instead of raising
        ZeroDivisionError.
    """
    t_set, p_set = set(target), set(pred)
    cap_num = len(t_set & p_set)  # items present in both sets

    p = cap_num / len(p_set) if p_set else 0.0
    r = cap_num / len(t_set) if t_set else 0.0
    # p + r is zero when nothing overlaps; F1 is defined as 0 in that case.
    f = 2 * p * r / (p + r) if p + r else 0.0

    print("P =", p)
    print("R =", r)
    print("F1 =", f)
    return p, r, f

if __name__ == '__main__':
    # Demo: segment a sample paragraph with a 3-character window, time both
    # matching passes, then score the selected result against a gold standard.
    test_str = '在这一年中,中国的改革开放和现代化建设继续向前迈进。国民经济保持了“高增长、低通胀”的良好发展态势。农业生产再次获得好的收成,企业改革继续深化,人民生活进一步改善。对外经济技术合作与交流不断扩大。'

    seg = Segment(test_str, 3)

    # perf_counter is the monotonic, high-resolution clock intended for
    # measuring short intervals; time.time() can jump with wall-clock changes.
    time_start = time.perf_counter()
    seg.MM(3, 0)
    seg.RMM(3, len(test_str))
    time_end = time.perf_counter()

    seg.printFinalResult()
    print('分词时间:', time_end - time_start, 's')

    # Gold-standard segmentation in the same "word/" token format.
    # NOTE(review): the published string had every single-character token
    # reduced to a bare "/" (the blank counts line up one-to-one with the
    # single characters of test_str), which shifted every gold offset.  The
    # characters are restored here so the tokens concatenate back to test_str.
    target_str = "在/  这/  一/  年/  中/  ,/  中国/  的/  改革/  开放/  和/  现代化/  建设/  继续/  向前/  迈进/  。/  国民经济/  保持/  了/  “/  高/  增长/  、/  低/  通胀/  ”/  的/  良好/  发展/  态势/  。/  农业/  生产/  再次/  获得/  好/  的/  收成/  ,/  企业/  改革/  继续/  深化/  ,/  人民/  生活/  进一步/  改善/  。/  对外/  经济/  技术/  合作/  与/  交流/  不断/  扩大/  。/"

    # Each word is represented by the interval [i, j] of the character
    # positions it occupies in the text.
    re_pred = to_region(seg.getFinalResult())
    re_target = to_region(target_str)

    print("分词结果:", re_pred)
    print("标准答案:", re_target)

    PRF(re_target, re_pred)

上一篇:Python “今日新闻”一个小程序,拿走就能用!
下一篇:微博爬取长津湖博文及评论