from pathlib import Path
import pandas as pd
def _read_article_body(filepath):
    """Return the text of *filepath* with its first two lines dropped.

    Ideographic spaces (U+3000) and newlines are removed from the
    remaining text, so the result is a single unbroken string.  A file
    with fewer than two lines yields an empty string rather than raising
    ``StopIteration``.
    """
    # Explicit encoding so the result does not depend on the platform
    # locale.  NOTE(review): assumes the corpus files are UTF-8 -- confirm.
    with open(filepath, encoding='utf-8') as f:
        # Skip the two leading lines (presumably per-file metadata
        # headers -- verify against the corpus).  The default argument
        # keeps short files from aborting the whole run.
        next(f, None)
        next(f, None)
        return f.read().replace('\u3000', '').replace('\n', '')


# Build a (label, text) table from ../data/text: every sub-directory is a
# label and every *.txt file inside it (except LICENSE.txt) is one sample.
# Entries are sorted so the row order of the output is reproducible;
# iterdir()/glob() order is otherwise filesystem-dependent.
paths = sorted(Path('../data/text').iterdir())
labels = []
texts = []
for path in paths:
    # Plain files sitting next to the label directories contain no samples.
    if not path.is_dir():
        continue
    for filepath in sorted(path.glob('*.txt')):
        if filepath.name == 'LICENSE.txt':
            continue
        texts.append(_read_article_body(filepath))
        labels.append(path.name)

news_df = pd.DataFrame({
    'label': labels,
    'text': texts,
})
# Write the table as a gzip-compressed CSV without the index column.
news_df.to_csv('../data/news.csv.gz', compression='gzip', index=False)