文章目錄
11.張量維度操作(拼接、維度擴展、壓縮、轉置、重複……)
https://zhuanlan.zhihu.com/p/31495102
10.將一個字符序列轉變爲one-hot編碼
"""
例如:
輸入:['a','b','c','a']
輸出:
[ [1,0,0],
[0,1,0],
[0,0,1],
[1,0,0]
]"""
def _label_string2matrix(nodes_labels_str):
b, c = np.unique(nodes_labels_str, return_inverse=True)
class_num = len(b)
sample_num = len(c)
class_num = class_num
nodes_labels = torch.zeros((sample_num, class_num))
i = 0
for la in c:
nodes_labels[i, la] = 1.0
i = i + 1
return nodes_labels
9. 多進程
類型一:多個進程執行相同的任務(進行模塊切分)
from concurrent.futures import ProcessPoolExecutor
# Template of the serial loop that is to be parallelised. The bare string below
# is the author's note: every iteration must be independent of the others,
# otherwise the work cannot be split across processes; the loop body is expected
# to contain the logic functions, some read-only parameters, and the variables
# to collect or return.
a_very_big_list = []
for item in a_very_big_list:
"""
你需要確保這個for循環每一輪之間是互不影響的, 否則無法進行並行化的處理
some codes include:
1. your logic functions
2. some parameters (read only)
3. some variables (you want to get or return)
"""
# NOTE(review): indentation was lost in this listing, so it cannot be determined
# from here whether the two placeholders below belong to the loop body.
parameters = None
variables = None
def _fun(list_split, parameters):
_variables = []
for item in list_split:
_variables = parameters
return _variables
def fun(a_very_big_list, parameters=None, workers=8):
    """Split a list into ``workers`` chunks, run ``_fun`` on each chunk in its
    own process, and concatenate the per-chunk results.

    :param a_very_big_list: the list whose items are to be processed.
    :param parameters: read-only value passed unchanged to every ``_fun`` call.
    :param workers: number of chunks and the ``max_workers`` of the pool.
    :return: the per-chunk results joined with ``+`` in chunk order.
    """
    chunk_size = int(len(a_very_big_list) / workers)
    last = workers - 1
    # Every chunk has `chunk_size` items except the last one, which also takes
    # whatever remains after the even split.
    chunks = [
        a_very_big_list[k * chunk_size:]
        if k == last
        else a_very_big_list[k * chunk_size:(k + 1) * chunk_size]
        for k in range(workers)
    ]
    merged = []
    print("len(wblog_content_split): ", len(chunks))
    with ProcessPoolExecutor(max_workers=workers) as executor:
        # How the partial results are assembled depends on their type: e.g.
        # dict.update for independent dicts, `+` concatenation for lists.
        for partial in executor.map(_fun, chunks, [parameters] * workers):
            merged = merged + partial
    return merged
類型二:多個進程執行不同的任務
import multiprocessing
def fun_1(parameter):
    """Placeholder for the first task; accepts one argument and does nothing."""
    return None


def fun_2(parameter):
    """Placeholder for the second task; accepts one argument and does nothing."""
    return None


def fun_3():
    """Placeholder for the third task; takes no arguments and does nothing."""
    return None


def fun_4():
    """Placeholder for the fourth task; takes no arguments and does nothing."""
    return None
# Argument handed to both child processes.
parameter=None
# fun_1 and fun_2 are each given their own child process.
# NOTE(review): there is no `if __name__ == "__main__":` guard around this
# launch code; start methods that re-import the main module require one —
# confirm how this snippet is meant to be run.
p1 = multiprocessing.Process(target=fun_1, args=(parameter,))
p2 = multiprocessing.Process(target=fun_2, args=(parameter,))
p1.start()
p2.start()
# Wait for fun_1's process to exit, then call fun_3 in this (parent) process;
# fun_2's process has not been joined yet at this point.
p1.join()
fun_3()
# fun_4 is called in the parent only after fun_2's process has been joined too.
p2.join()
fun_4()
"""
進程執行的路線圖
fun_1 | fun_2
| |
v v
fun_3 | fun_4
請注意,進程之間是不共享數據的,不要企圖在類裏面這樣使用,實際上會創建好幾個類對象,並且各個進程只會修改自己類對象裏面的變量
"""
8. 打混數據並保證索引按照正常的排序
# Shuffle the rows (frac=1 samples all of them), then drop the shuffled index
# and renumber from 0 (drop=True keeps the old index out of the columns).
# `df` is presumably a pandas DataFrame — defined outside this snippet.
# The result of the expression is not bound to a name here.
df.sample(frac=1).reset_index(drop=True)
7. 解析json字符串並返回支持屬性訪問的對象
from argparse import Namespace
import json
def json_to_object(data):
    """Parse a JSON string so that JSON objects support attribute access.

    Every JSON object (at any nesting depth) is turned into an
    ``argparse.Namespace`` whose attributes are the object's keys; other JSON
    values keep the types ``json.loads`` normally produces.

    :param data: the JSON document as a string.
    :return: the parsed value.
    """
    def _as_namespace(mapping):
        # Called by the decoder with the dict built for each JSON object.
        return Namespace(**mapping)

    return json.loads(data, object_hook=_as_namespace)
# Load the settings file and expose its keys as attributes of `args`.
with open("default.json") as config_file:
    raw_json = config_file.read()
    args = json_to_object(raw_json)
6. 概率化編程
def add_unknown_words(word_vecs, vocab, min_df=1, k=300):
"""
Give every sufficiently frequent word that lacks a vector a random one.

For each ``word`` in ``vocab`` that is not yet a key of ``word_vecs`` and
whose ``vocab[word]`` is at least ``min_df``, store
``np.random.uniform(-0.25, 0.25, k)`` under ``word_vecs[word]``. The
mapping is modified in place; there is no return statement.
The original note describes ``vocab[word]`` as a document count and the
range as giving the "same variance as pre-trained ones" — neither can be
verified from this snippet.
"""
for word in vocab:
if word not in word_vecs and vocab[word] >= min_df:
word_vecs[word] = np.random.uniform(-0.25,0.25,k)
# NOTE(review): indentation was lost in this listing, so it is unclear whether
# this print runs per added word, per vocabulary word, or once after the loop.
print(word)
def pro(pro):
    """Template for code that should only run some of the time.

    Draws one number from ``random.random()`` and enters the (placeholder)
    branch when the draw is less than or equal to ``pro``.

    :param pro: threshold compared against the draw.
    :return: None.
    """
    triggered = random.random() <= pro
    if triggered:
        pass  # placeholder for the action taken when the branch is entered
5. 定義一段棄用的代碼
有時候有些函數功能我們打算棄用,但是擔心版本不兼容,爲了保持接口的兼容性,可以仿照下面的代碼進行編程
@deprecated("Method will be removed in 4.0.0, use self.wv.__getitem__() instead")
def __getitem__(self, words):
    """
    Deprecated pass-through kept for interface compatibility.

    Forwards ``words`` to ``self.wv.__getitem__()`` and returns its result;
    call ``self.wv.__getitem__()`` directly instead.
    Refer to the documentation for `gensim.models.keyedvectors.Word2VecKeyedVectors.__getitem__`
    """
    keyed_vectors = self.wv
    return keyed_vectors.__getitem__(words)
4. 計算文件行數
def count_lines(f):
    """Return the number of lines in the file at path ``f``.

    :param f: path to inspect.
    :return: the line count when ``f`` is a regular file, otherwise 0.
    """
    if not path.isfile(f):  # anything that is not a regular file counts as 0 lines
        return 0
    # Open through a context manager so the handle is closed as soon as the
    # count is done instead of being left for the garbage collector.
    with open(f) as handle:
        # Equivalent to building [1, 1, ..., 1] with one entry per line and
        # summing it, but without materialising the list.
        return sum(1 for _ in handle)
3. 計算一組文件裏單詞的詞頻
#! /usr/bin/env python
# -*- coding: utf-8 -*-
from concurrent.futures import ProcessPoolExecutor
from collections import Counter
def count_words(file):
    """Count how often each whitespace-separated word occurs in one file.

    :param file: path of the text file to read.
    :return: ``Counter`` mapping each word to its number of occurrences.
    """
    tally = Counter()
    with open(file, 'r') as handle:
        for line in handle:
            # split() with no argument already ignores leading and trailing
            # whitespace, including the newline.
            tally.update(line.split())
    return tally
def count_textfiles(files, workers=1):
    """Count word frequencies over a group of files using a process pool.

    ``count_words`` is run on every file through a ``ProcessPoolExecutor``
    and the per-file counters are merged into one.

    :param files: iterable of file paths.
    :param workers: ``max_workers`` of the process pool.
    :return: ``Counter`` with the word counts summed over all files.
    """
    total = Counter()
    with ProcessPoolExecutor(max_workers=workers) as pool:
        for per_file in pool.map(count_words, files):
            total.update(per_file)
    return total
2. 代碼計時
from time import time
# Timestamp taken immediately before the section being measured.
t0 = time()
# Placeholder: replace the next line with the statements to be timed.
your code here
# Timestamp taken immediately after; t1 - t0 is the elapsed time in seconds.
t1 = time()
# NOTE(review): the message text looks carried over from another script;
# adjust the label to describe what was actually timed.
print('make_directed: added missing edges {}s'.format(t1 - t0))
1. 創建一個可迭代的文件對象
class WalksCorpus(object):
    """Iterable that streams whitespace-split lines from a group of files."""

    def __init__(self, file_list):
        """
        :param file_list: paths of the files to read (per the original note,
            the files that write_walks_to_disk wrote to local disk). A single
            path given as a ``str`` is treated as a one-element list.
        """
        if isinstance(file_list, str):
            # Iterating a bare string would yield its characters, and each
            # character would then be opened as if it were a file name.
            file_list = [file_list]
        self.file_list = file_list

    def __iter__(self):
        """
        Yield one list of tokens per line, file by file, in ``file_list`` order.

        Sequence types (list, tuple, str) are iterable, and so are dict, set
        and file objects. A user-defined class becomes iterable as well once
        it provides ``__iter__()`` or ``__getitem__()``.

        Suppose file_list=['output.walks.0', 'output.walks.1'] where
        'output.walks.0' contains:
            8 2 4 8 2 31 34 28 24 28 25 28 3 10 3
            2 1 22 1 18 2 8 2 4 2 18 1 8 3 2 1
        and 'output.walks.1' contains:
            32 25 26 32 29 3 33 31 9 33 16 34
            6 11 1 20 34 30 24 30 24 28 3 1 14
        Iterating then produces, one item at a time:
            ['8', '2', '4', '8', '2', '31', '34', '28', '24', '28', '25', '28', '3', '10', '3']
            ['2', '1', '22', '1', '18', '2', '8', '2', '4', '2', '18', '1', '8', '3', '2', '1']
            ['32', '25', '26', '32', '29', '3', '33', '31', '9', '33', '16', '34']
            ['6', '11', '1', '20', '34', '30', '24', '30', '24', '28', '3', '1', '14']
        The tokens are strings (the output of ``str.split``), and the lines
        are produced lazily rather than returned as one list.

        More on iteration and yield:
        [Python 中的黑暗角落(一):理解 yield 關鍵字](https://liam0205.me/2017/06/30/understanding-yield-in-python/)

        :return: generator of ``list[str]``, one per line.
        """
        for file in self.file_list:
            with open(file, 'r') as f:
                for line in f:
                    yield line.split()
# WalksCorpus iterates over its argument and opens every element as a file, so
# the paths must be given as a list; a bare string would be walked character
# by character.
walk_files = ["your_file.txt"]
walks = WalksCorpus(walk_files)