ลอง wordcutpy บน pypy3

#python #pypy #wordcut #tokenizer
3 มีนาคม 2562
พอทดสอบกับไฟล์ 11MB ใช้ pypy3 ทำให้ wordcutpy เร็วขึ้นเกิน 2 เท่า! คือใช้เวลาจาก 16 วินาที เหลือไม่ถึง 8 วินาที
```
(base) [vee@mint310 wiki]$ python3 wordcutpy.py 
16598
(base) [vee@mint310 wiki]$ sudo docker run -it --rm --name my-running-script -v "$PWD":/usr/src/myapp -w /usr/src/myapp pypy:3 pypy3 wordcutpy.py
7833
(base) [vee@mint310 wiki]$ python3 wordcutpy.py 
16093
(base) [vee@mint310 wiki]$ sudo docker run -it --rm --name my-running-script -v "$PWD":/usr/src/myapp -w /usr/src/myapp pypy:3 pypy3 wordcutpy.py
7821
(base) [vee@mint310 wiki]$ python3 wordcutpy.py 
16272
(base) [vee@mint310 wiki]$ sudo docker run -it --rm --name my-running-script -v "$PWD":/usr/src/myapp -w /usr/src/myapp pypy:3 pypy3 wordcutpy.py
7810
```

โค้ดของ wordcutpy.py ที่ใช้ทดสอบ:

```python
# wordcutpy.py
# การใช้ wordcutpy ที่ถูกต้องคือ copy & paste เลย ไม่ต้องใช้ pip 😅
# แล้วก็ copy bigthai.txt มาไว้ folder เดียวกัน

import sys
import re

class PrefixTree(object):
    """Prefix tree over sequences, stored as one flat transition table.

    ``self.tab`` maps ``(row, offset, member)`` to a tuple
    ``(next_row, is_terminal, payload)``:

    * ``row`` is the node reached so far (``0`` is the root),
    * ``offset`` is the position of ``member`` inside the sequence,
    * ``payload`` is only kept on terminal transitions, otherwise ``None``.
    """

    def __init__(self, members_with_payload):
        """Build the table from an iterable of ``(sequence, payload)`` pairs.

        Passing ``None`` yields an empty tree.
        """
        self.tab = {}
        if members_with_payload is None:
            return

        entries = sorted(members_with_payload, key=lambda pair: pair[0])
        for index, (members, payload) in enumerate(entries):
            row_no = 0
            last_offset = len(members) - 1
            for offset, member in enumerate(members):
                key = (row_no, offset, member)
                existing = self.tab.get(key)
                if existing is not None:
                    # Transition already present: just follow it.
                    row_no = existing[0]
                    continue
                # New transition; the entry's index doubles as its row number.
                terminal = offset == last_offset
                self.tab[key] = (index, terminal, payload if terminal else None)
                row_no = index

    def lookup(self, i, offset, member):
        """Return the transition tuple for ``(i, offset, member)`` or ``None``."""
        return self.tab.get((i, offset, member))

# Link type tags stored under the "type" key of each path entry in build_path.
UNK   = 1  # fallback link for text that no other rule matched
DICT  = 2  # link for a word found in the dictionary
INIT  = 3  # sentinel entry at the start of the path
LATIN = 4  # link for a run of ASCII letters ([A-Za-z])
PUNC  = 5  # link for a run of space characters

def is_better(link0, link1):
    """Return True when ``link1`` should replace ``link0`` as the best link.

    Links are ranked lexicographically: fewer unknown tokens (``"unk"``)
    wins first, and only on a tie does the smaller token count (``"w"``)
    win.  Comparing the two keys independently would let a link with more
    unknown tokens win just because it has fewer tokens, which makes the
    result depend on the order in which candidates are examined.

    Args:
        link0: the current best link, or ``None`` when there is none yet.
        link1: the candidate link (dict with ``"unk"`` and ``"w"`` keys).

    Returns:
        ``True`` if ``link0`` is ``None`` or ``link1`` ranks strictly better.
    """
    if link0 is None:
        return True

    # Tuple comparison gives the (unk, w) lexicographic ordering directly.
    return (link1["unk"], link1["w"]) < (link0["unk"], link0["w"])

def build_path(dix, s):
    """Build the table of best links used to segment the text ``s``.

    The returned list has ``len(s) + 1`` entries.  ``path[0]`` is the INIT
    sentinel and ``path[k]`` is the best link for a token ending just before
    character index ``k``.  Every link is a dict with the keys:

    * ``"p"``    -- start offset of the token in ``s`` (which is also the
      index of the preceding entry in ``path``); ``None`` for the sentinel.
    * ``"w"``    -- number of tokens on the path up to and including it.
    * ``"unk"``  -- number of unknown (UNK) tokens on that path.
    * ``"type"`` -- one of INIT, DICT, LATIN, PUNC or UNK.

    Args:
        dix: object whose ``lookup(row, offset, ch)`` returns ``None`` or a
            ``(next_row, is_final, payload)`` tuple.
        s: the text to segment.

    Returns:
        The list of links described above.
    """
    is_latin = re.compile(u"[A-Za-z]").match  # hoisted out of the loop
    text_len = len(s)

    # Offset where the current run of unknown text starts.
    left_boundary = 0
    dict_acc_list = []

    path = [{"p": None, "w": 0, "unk": 0, "type": INIT}]

    latin_s = None
    latin_e = None

    punc_s = None
    punc_e = None

    for i, ch in enumerate(s):
        is_last = i + 1 == text_len

        # Update dict acceptors: start a new one at this character, then
        # advance every acceptor by ``ch`` and drop the ones that fail.
        dict_acc_list.append({"s": i, "p": 0, "final": False})
        advanced = []
        for acc in dict_acc_list:
            child = dix.lookup(acc["p"], i - acc["s"], ch)
            if child is not None:
                child_p, is_final, _payload = child
                advanced.append({"s": acc["s"], "p": child_p,
                                 "final": is_final})
        dict_acc_list = advanced

        # Latin words: ``latin_e`` is set only on the last letter of a run,
        # i.e. when the text ends here or the next character is not a letter
        # (same rule as the space runs below).
        if is_latin(ch):
            if latin_s is None:
                latin_s = i
            if is_last or not is_latin(s[i + 1]):
                latin_e = i
        else:
            latin_s = None
            latin_e = None

        # Punctuation (runs of spaces): ``punc_e`` marks the last space.
        if ch == " ":
            if punc_s is None:
                punc_s = i
            if is_last or s[i + 1] != " ":
                punc_e = i
        else:
            punc_s = None
            punc_e = None

        # Collect candidate links ending at this character, in the order
        # wordlist -> latin word -> punctuation.
        candidates = [(acc["s"], DICT)
                      for acc in dict_acc_list if acc["final"]]
        if latin_s is not None and latin_e is not None:
            candidates.append((latin_s, LATIN))
        if punc_s is not None and punc_e is not None:
            candidates.append((punc_s, PUNC))

        # Select the best link among the candidates.
        link = None
        for start, link_type in candidates:
            p_link = path[start]
            candidate = {"p": start,
                         "w": p_link["w"] + 1,
                         "unk": p_link["unk"],
                         "type": link_type}
            if is_better(link, candidate):
                link = candidate

        # Fallback: extend an unknown token starting at the left boundary.
        if link is None:
            p_link = path[left_boundary]
            link = {"p": left_boundary,
                    "w": p_link["w"] + 1,
                    "unk": p_link["unk"] + 1,
                    "type": UNK}
        path.append(link)

        if link["type"] != UNK:
            # A known token ends at character i, so the next unknown run
            # starts at i + 1.  Using i would make the unknown token swallow
            # the last character of the known token.
            left_boundary = i + 1
    return path

def path_to_tokens(txt, path):
    """Turn a link table into the list of tokens it describes.

    Starting from the last entry of ``path``, follow each link's ``"p"``
    back-pointer until the entry whose ``"p"`` is ``None`` is reached,
    slicing ``txt`` between consecutive positions.

    Returns:
        The tokens in text order, or ``None`` when ``path`` has fewer than
        two entries.
    """
    if len(path) < 2:
        return None

    tokens = []
    end = len(path) - 1
    start = path[end]["p"]
    while start is not None:
        tokens.append(txt[start:end])
        end = start
        start = path[end]["p"]

    # Tokens were collected back to front.
    return tokens[::-1]

def tokenize(dix, txt):
    """Split ``txt`` into tokens using the dictionary ``dix``.

    ``None`` and the empty string give an empty list; any other text is
    passed through ``build_path`` and ``path_to_tokens``.
    """
    if txt is not None and txt != "":
        return path_to_tokens(txt, build_path(dix, txt))
    return []

class Wordcut(object):
    """Word tokenizer backed by a ``PrefixTree`` built from a word list."""

    def __init__(self, wordlist):
        """Build the dictionary from an iterable of words."""
        self.dix = PrefixTree([(word, None) for word in wordlist])

    @classmethod
    def bigthai(cls):
        """Initialize from ``bigthai.txt``.

        The word list is read from the directory containing this module,
        one word per line; duplicates are dropped and the words are sorted
        before being handed to the constructor.
        """
        import os

        file_dir = os.path.dirname(__file__)
        filename = os.path.join(file_dir, 'bigthai.txt')
        # Read as UTF-8 explicitly so the result does not depend on the
        # locale's default encoding.
        # NOTE(review): assumes bigthai.txt is UTF-8 encoded -- confirm.
        with open(filename, encoding="utf-8") as dict_file:
            word_list = sorted({w.rstrip() for w in dict_file})
        return cls(word_list)

    def tokenize(self, s):
        """Return the list of tokens found in ``s``."""
        return tokenize(self.dix, s)

# Benchmark: tokenize every line of wiki_plain_100k.txt into wiki.cut and
# print the elapsed time in milliseconds (dictionary loading is not timed).
wordcut = Wordcut.bigthai()

import time

# perf_counter() is a monotonic clock meant for measuring intervals;
# time.time() can jump if the system clock is adjusted mid-run.
t1 = int(round(time.perf_counter() * 1000))

# Explicit UTF-8 so the run does not depend on the locale's default encoding.
# NOTE(review): assumes the input corpus is UTF-8 encoded -- confirm.
with open("wiki_plain_100k.txt", encoding="utf-8") as fi:
    with open("wiki.cut", "w", encoding="utf-8") as fo:
        for line in fi:
            line = line.strip()
            print(" ".join(wordcut.tokenize(line)), file=fo)

t2 = int(round(time.perf_counter() * 1000))

print(t2-t1)

# LICENSE: LGPLv3

```

ซอร์สโค้ด: https://github.com/veer66/wordcutpy
DEV Community

ลอง wordcutpy บน pypy3

Top comments (0)