|
4 | 4 |
|
5 | 5 | import re
|
6 | 6 | from pathlib import Path
|
| 7 | +from typing import TYPE_CHECKING |
7 | 8 |
|
8 | 9 | import snowballstemmer
|
9 | 10 |
|
10 | 11 | from sphinx.search import SearchLanguage
|
11 | 12 |
|
| 13 | +if TYPE_CHECKING: |
| 14 | + from collections.abc import Iterator |
| 15 | + |
12 | 16 | try:
|
13 | 17 | import jieba # type: ignore[import-not-found]
|
14 |
| - |
15 |
| - JIEBA = True |
16 |
| - JIEBA_DEFAULT_DICT = Path(jieba.__file__).parent / jieba.DEFAULT_DICT_NAME |
| 18 | + from jieba import cut_for_search |
| 19 | + from jieba import load_userdict as jieba_load_userdict |
17 | 20 | except ImportError:
|
18 |
| - JIEBA = False |
19 |
| - JIEBA_DEFAULT_DICT = Path() |
| 21 | + JIEBA_DEFAULT_DICT = '' |
| 22 | + |
| 23 | + def jieba_load_userdict(f: str) -> None: |
| 24 | + pass |
| 25 | + |
| 26 | + def cut_for_search(sentence: str, HMM: bool = True) -> Iterator[str]: |
| 27 | + yield from () |
| 28 | + |
| 29 | +else: |
| 30 | + JIEBA_DEFAULT_DICT = ( |
| 31 | + Path(jieba.__file__, '..', jieba.DEFAULT_DICT_NAME).resolve().as_posix() |
| 32 | + ) |
| 33 | + del jieba |
20 | 34 |
|
21 | 35 | english_stopwords = {
|
22 | 36 | 'a', 'and', 'are', 'as', 'at',
|
@@ -231,18 +245,14 @@ def __init__(self, options: dict[str, str]) -> None:
|
231 | 245 | self.latin_terms: set[str] = set()
|
232 | 246 |
|
233 | 247 | def init(self, options: dict[str, str]) -> None:
|
234 |
| - if JIEBA: |
235 |
| - dict_path = options.get('dict', JIEBA_DEFAULT_DICT) |
236 |
| - if dict_path and Path(dict_path).is_file(): |
237 |
| - jieba.load_userdict(dict_path) |
| 248 | + dict_path = options.get('dict', JIEBA_DEFAULT_DICT) |
| 249 | + if dict_path and Path(dict_path).is_file(): |
| 250 | + jieba_load_userdict(str(dict_path)) |
238 | 251 |
|
239 | 252 | self.stemmer = snowballstemmer.stemmer('english')
|
240 | 253 |
|
241 | 254 | def split(self, input: str) -> list[str]:
|
242 |
| - if JIEBA: |
243 |
| - chinese: list[str] = list(jieba.cut_for_search(input)) |
244 |
| - else: |
245 |
| - chinese = [] |
| 255 | + chinese: list[str] = list(cut_for_search(input)) |
246 | 256 |
|
247 | 257 | latin1 = [term.strip() for term in self.latin1_letters.findall(input)]
|
248 | 258 | self.latin_terms.update(latin1)
|
|
0 commit comments