import pickle
from pathlib import Path

from gensim.corpora.textcorpus import TextDirectoryCorpus
from pyltp import Segmentor


class MyTextDirCorpus(TextDirectoryCorpus):

    def __init__(self, input, **kwargs):
        # Force the parent class to use our LTP-based tokenizer.
        kwargs['tokenizer'] = self.tokenizer
        super().__init__(input, **kwargs)

    def tokenizer(self, text):
        # Lazily load the LTP segmentation model on first use.
        if not hasattr(self, '_segmentor'):
            model_path = r'D:\mysites\text-characters\tcharacters\ltp\ltp_data\cws.model'
            segmentor = Segmentor()  # initialize the segmentor instance
            segmentor.load(model_path)
            self._segmentor = segmentor
        segmentor = self._segmentor
        return segmentor.segment(text)

    def __del__(self):
        """Release the segmentor's resources."""
        if hasattr(self, '_segmentor'):
            self._segmentor.release()
        try:
            super().__del__()
        except AttributeError:
            pass

    # We override this method to force 'utf8' encoding.
    def getstream(self):
        """Yield documents from the underlying plain text collection (of one or more files).

        Each item yielded from this method will be considered a document
        by subsequent preprocessing methods.

        If `lines_are_documents` was set to True, items will be lines from files.
        Otherwise there will be one item per file, containing the entire contents
        of the file.
        """
        num_texts = 0
        for path in self.iter_filepaths():
            with open(path, 'rt', encoding='utf8') as f:
                if self.lines_are_documents:
                    for line in f:
                        yield line.strip()
                        num_texts += 1
                else:
                    content = f.read().strip()
                    yield content
                    num_texts += 1
        self.length = num_texts

    def get_texts_from_tokens(self):
        # Read back tokens cached next to each source file as
        # '<filename>.cached_tokens' pickle files.
        for fpath in self.iter_filepaths():
            fpath = Path(fpath)
            token_path = fpath.parent / (fpath.name + '.cached_tokens')
            yield pickle.loads(token_path.read_bytes())
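For context, here is a minimal usage sketch. The directory path is a placeholder, and it assumes gensim and pyltp are installed along with the LTP `cws.model` file referenced above:

# Minimal usage sketch (hypothetical directory path):
corpus = MyTextDirCorpus('path/to/text_dir', lines_are_documents=True)
for tokens in corpus.get_texts():  # get_texts() applies our LTP tokenizer
    print(list(tokens))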
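`get_texts_from_tokens` expects a `<filename>.cached_tokens` pickle to exist next to each source file, but the code above never writes those caches. The helper below is a hypothetical sketch of one way to produce them, assuming one document per file (`lines_are_documents=False`) so the document order matches the file-path order:

# Hypothetical helper (not from the original code): writes the token cache
# that get_texts_from_tokens() expects, one pickle per source file.
# Assumes lines_are_documents=False so documents align with file paths.
def cache_tokens(corpus):
    for fpath, tokens in zip(corpus.iter_filepaths(), corpus.get_texts()):
        fpath = Path(fpath)
        token_path = fpath.parent / (fpath.name + '.cached_tokens')
        token_path.write_bytes(pickle.dumps(list(tokens)))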