Loading 20news locally

Step 1: Download the dataset archive, 20newsbydate.tar.gz (the file that fetch_20newsgroups would otherwise fetch from ARCHIVE.url).
Step 2: Put the downloaded archive into /data0/liuyongkang/scikit_learn_data/20news_home.
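The 20news_home directory lives under scikit-learn's data_home, which fetch_20newsgroups resolves from its data_home argument, the SCIKIT_LEARN_DATA environment variable, or the default ~/scikit_learn_data. A quick check that the paths line up (the /data0/liuyongkang prefix is just the example path used in this post):

from sklearn.datasets import get_data_home

# Directory scikit-learn will use; 20news_home and the cached .pkz file
# are created under it.
print(get_data_home())
# Expected here: /data0/liuyongkang/scikit_learn_data
# If it differs, set SCIKIT_LEARN_DATA or pass data_home= to fetch_20newsgroups.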
Step 3: Modify the code.
~/.conda/envs/tf1.9g/lib/python3.6/site-packages/sklearn/datasets$ vim _twenty_newsgroups.py
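The site-packages path above is specific to this conda environment. A quick way to find the file to edit in any environment (note: in scikit-learn releases before 0.22 the module is named twenty_newsgroups.py, without the leading underscore):

import sklearn.datasets._twenty_newsgroups as tn
print(tn.__file__)  # full path of the file to open in vim

Inside that file, the _download_20newsgroups function is changed as follows: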

def _download_20newsgroups(target_dir, cache_path):
    """Download the 20 newsgroups data and stored it as a zipped pickle."""
    # Point target_dir at the folder holding the local copy instead of a temp dir.
    target_dir = "/data0/liuyongkang/scikit_learn_data/20news_home/"
    train_path = os.path.join(target_dir, TRAIN_FOLDER)
    test_path = os.path.join(target_dir, TEST_FOLDER)

    # if not os.path.exists(target_dir):
    #     os.makedirs(target_dir)

    # logger.info("Downloading dataset from %s (14 MB)", ARCHIVE.url)
    # archive_path = _fetch_remote(ARCHIVE, dirname=target_dir)

    # logger.debug("Decompressing %s", archive_path)
    # Use the archive that was placed in the folder by hand.
    archive_path = "/data0/liuyongkang/scikit_learn_data/20news_home/20newsbydate.tar.gz"
    tarfile.open(archive_path, "r:gz").extractall(path=target_dir)
    # os.remove(archive_path)

    # Store a zipped pickle
    cache = dict(train=load_files(train_path, encoding='latin1'),
                 test=load_files(test_path, encoding='latin1'))
    compressed_content = codecs.encode(pickle.dumps(cache), 'zlib_codec')
    with open(cache_path, 'wb') as f:
        f.write(compressed_content)

    # Note: this also deletes the local 20newsbydate.tar.gz, because the archive
    # now sits inside target_dir; keep a backup copy if you may need to re-run.
    shutil.rmtree(target_dir)
    return cache
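The patched function really only does two things with the local archive: extract it under target_dir and write a zlib-compressed pickle to cache_path. The same effect can be sketched without editing the installed package by writing that cache file directly; this is an illustrative alternative, not part of the original post, and it imports CACHE_NAME, TRAIN_FOLDER and TEST_FOLDER from the (private) module rather than hard-coding them, since their values can differ between versions.

import codecs
import os
import pickle
import tarfile

from sklearn.datasets import get_data_home, load_files
from sklearn.datasets._twenty_newsgroups import (
    CACHE_NAME, TRAIN_FOLDER, TEST_FOLDER)

data_home = get_data_home()
target_dir = os.path.join(data_home, "20news_home")
archive_path = os.path.join(target_dir, "20newsbydate.tar.gz")  # local copy

# Extract the archive and build the train/test bunches, mirroring what the
# patched _download_20newsgroups does.
tarfile.open(archive_path, "r:gz").extractall(path=target_dir)
cache = dict(
    train=load_files(os.path.join(target_dir, TRAIN_FOLDER), encoding="latin1"),
    test=load_files(os.path.join(target_dir, TEST_FOLDER), encoding="latin1"))

# Write the compressed pickle where fetch_20newsgroups expects its cache,
# so later calls skip the download path entirely.
with open(os.path.join(data_home, CACHE_NAME), "wb") as f:
    f.write(codecs.encode(pickle.dumps(cache), "zlib_codec"))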
Step 4: Test.

from sklearn.datasets import fetch_20newsgroups
from pprint import pprint
newsgroups_train = fetch_20newsgroups(subset='train')
print(newsgroups_train.filenames.shape) # (11314,)
print(newsgroups_train.target.shape) # (11314,)

newsgroups_test = fetch_20newsgroups(subset='test')
print(newsgroups_test.filenames.shape) # (7532,)
print(newsgroups_test.target.shape) # (7532,)
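Once the cache has been built, fetch_20newsgroups no longer touches the network, and the usual arguments work as documented. For example, loading only two of the twenty groups and stripping headers, footers and quotes (an illustrative extra, not part of the original test):

cats = ['sci.space', 'comp.graphics']
subset_train = fetch_20newsgroups(subset='train', categories=cats,
                                  remove=('headers', 'footers', 'quotes'))
print(subset_train.target_names)  # ['comp.graphics', 'sci.space']
print(len(subset_train.data))     # number of training documents in these groups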
