A collection of classic Python code snippets (updated on 2019-03-22)

11.張量維度操作(拼接、維度擴展、壓縮、轉置、重複……)

https://zhuanlan.zhihu.com/p/31495102

10.將一個字符序列轉變爲one-hot編碼

"""
例如:
輸入:['a','b','c','a']
輸出:
[ [1,0,0],
  [0,1,0],
  [0,0,1],
  [1,0,0]
]"""


def _label_string2matrix(nodes_labels_str):
    b, c = np.unique(nodes_labels_str, return_inverse=True)
    class_num = len(b)
    sample_num = len(c)
    class_num = class_num
    nodes_labels = torch.zeros((sample_num, class_num))
    i = 0
    for la in c:
        nodes_labels[i, la] = 1.0
        i = i + 1
    return nodes_labels

9. 多進程

類型一:多個進程執行相同的任務(進行模塊切分)

from concurrent.futures import ProcessPoolExecutor

a_very_big_list = []

for item in a_very_big_list:
    # Template of the sequential loop that is to be parallelised.
    # Make sure the iterations of this loop do not affect one another;
    # otherwise the work cannot be processed in parallel.
    # The loop body typically includes:
    # 1. your logic functions
    # 2. some parameters (read only)
    # 3. some variables (you want to get or return)
    parameters, variables = None, None


def _fun(list_split, parameters):
    _variables = []
    for item in list_split:
        _variables = parameters

    return _variables


def fun(a_very_big_list, parameters=None, workers=8):
    """Cut a list into ``workers`` slices, run ``_fun`` on each slice in a
    separate process and concatenate the per-slice results.

    :param a_very_big_list: the sequence to split; it is sliced, so it must
        support ``len`` and slicing.
    :param parameters: passed unchanged to every ``_fun`` call.
    :param workers: number of slices and the ``max_workers`` of the pool.
    :return: the results of all slices joined with ``+`` in slice order, so
        each ``_fun`` call has to return something that can be added to a
        list.
    """
    step = len(a_very_big_list) // workers
    last = workers - 1
    # Every slice holds `step` items except the last one, which also takes
    # whatever is left over.
    list_split = [
        a_very_big_list[idx * step:] if idx == last
        else a_very_big_list[idx * step:(idx + 1) * step]
        for idx in range(workers)
    ]
    print("len(wblog_content_split): ", len(list_split))

    variables = []
    with ProcessPoolExecutor(max_workers=workers) as executor:
        # One copy of the shared parameters per slice, matched up by map().
        per_slice_parameters = [parameters] * workers
        for partial in executor.map(_fun, list_split, per_slice_parameters):
            # Merge the result returned by each process here. How to merge
            # depends on the concrete case: for independent dicts use
            # dict.update; for lists concatenate with +.
            variables = variables + partial
    return variables

類型二:多個進程執行不同的任務

import multiprocessing

def fun_1(parameter):
    """Placeholder task taking one argument; does nothing and returns None."""

def fun_2(parameter):
    """Placeholder task taking one argument; does nothing and returns None."""

def fun_3():
    """Placeholder task without arguments; does nothing and returns None."""
def fun_4():
    """Placeholder task without arguments; does nothing and returns None."""

parameter = None

# Processes must only be created when this file runs as the main program:
# under the "spawn" start method every child re-imports this module, and
# unguarded start() calls would then try to create processes again.
if __name__ == "__main__":
    p1 = multiprocessing.Process(target=fun_1, args=(parameter,))
    p2 = multiprocessing.Process(target=fun_2, args=(parameter,))
    p1.start()
    p2.start()
    p1.join()   # wait for fun_1 to finish ...
    fun_3()     # ... then run fun_3 in the parent process
    p2.join()   # wait for fun_2 to finish ...
    fun_4()     # ... then run fun_4 in the parent process

# Execution timeline of the processes:
#
#   fun_1 | fun_2
#     |       |
#     v       v
#   fun_3 | fun_4
#
# Note that processes do not share data. Do not try to use this pattern to
# mutate the attributes of one object from several processes: each process
# works on its own copy of the object and only modifies the variables of
# that copy.

8. 打亂數據並保證索引按照正常的順序排列

# Shuffle the rows and renumber them: sample(frac=1) draws every row in random
# order, reset_index(drop=True) discards the old index instead of keeping it.
# NOTE(review): assumes `df` is a pandas DataFrame defined elsewhere; the
# result is not assigned here, so bind it (e.g. `df = ...`) to keep it.
df.sample(frac=1).reset_index(drop=True)

7. 解析json字符串並返回支持屬性訪問的對象

from argparse import Namespace
import json
def json_to_object(data):
    """Parse a JSON string into objects whose keys are readable as attributes.

    Every JSON object in ``data`` (at any nesting depth) is turned into an
    ``argparse.Namespace`` built from its key/value pairs.

    :param data: JSON text.
    :return: the parsed value, with JSON objects replaced by ``Namespace``
        instances.
    """
    def _as_namespace(mapping):
        return Namespace(**mapping)

    return json.loads(data, object_hook=_as_namespace)

# Read the whole of "default.json" and parse it with json_to_object, so the
# settings can be accessed as attributes of `args`.
with open("default.json") as f:
    args = json_to_object(f.read())

6. 概率化編程


def add_unknown_words(word_vecs, vocab, min_df=1, k=300):
    """Give every sufficiently frequent unknown word its own random vector.

    For each word in ``vocab`` that has no entry in ``word_vecs`` and whose
    ``vocab`` value is at least ``min_df``, a vector of ``k`` values drawn
    uniformly from [-0.25, 0.25) is stored in ``word_vecs`` and the word is
    printed. The original note says this range is meant to give the new
    vectors the same variance as the pre-trained ones.

    :param word_vecs: mapping word -> vector; modified in place.
    :param vocab: mapping word -> count compared against ``min_df``.
    :param min_df: minimum count a word needs to receive a vector.
    :param k: length of the generated vectors.
    """
    for token in vocab:
        if token in word_vecs:
            continue  # already has a vector
        if vocab[token] >= min_df:
            word_vecs[token] = np.random.uniform(-0.25, 0.25, k)
            print(token)

def pro(pro):
    """Template for running a branch with probability ``pro``.

    Draws one number from ``random.random()`` and enters the (currently
    empty) branch when the draw is less than or equal to ``pro``.

    :param pro: threshold compared against the random draw.
    :return: None.
    """
    draw = random.random()
    if draw <= pro:
        # Placeholder: put the code that should run with this probability here.
        pass

5. 定義一段棄用的代碼

有時候有些函數功能我們打算棄用,但是擔心版本不兼容,爲了保持接口的兼容性,可以仿照下面的代碼進行編程

 @deprecated("Method will be removed in 4.0.0, use self.wv.__getitem__() instead")
    def __getitem__(self, words):
        """
        Deprecated. Use self.wv.__getitem__() instead.

        Kept as a thin forwarding wrapper: the call is passed unchanged to
        ``self.wv.__getitem__(words)`` and its result is returned, so existing
        callers keep working. The removal notice is the message given to the
        ``@deprecated`` decorator.
        Refer to the documentation for `gensim.models.keyedvectors.Word2VecKeyedVectors.__getitem__`
        """
        return self.wv.__getitem__(words)

4. 計算文件行數

def count_lines(f):
    """Return the number of lines in the file at path ``f``.

    :param f: path of the file to inspect.
    :return: the line count, or 0 when ``f`` is not an existing regular file
        (for example a directory or a missing path).
    """
    if not path.isfile(f):  # not a regular file: nothing to count
        return 0
    # The context manager closes the handle even if reading fails; the
    # generator yields a 1 per line, so the sum equals
    # sum([1 for line in handle]) without building the list.
    with open(f) as handle:
        return sum(1 for _ in handle)

3. 計算一組文件裏單詞的詞頻


#! /usr/bin/env python
# -*- coding: utf-8 -*-
from concurrent.futures import ProcessPoolExecutor
from collections import Counter


def count_words(file):
    """Count how often each whitespace-separated word occurs in a text file.

    :param file: path of the file to read.
    :return: ``Counter`` mapping each word to its number of occurrences.
    """
    tally = Counter()
    with open(file, 'r') as handle:
        for line in handle:
            # split() without arguments already ignores leading and trailing
            # whitespace, including the newline.
            tally.update(line.split())
    return tally


def count_textfiles(files, workers=1):
    """Count word frequencies over several files using a process pool.

    Each file is handed to ``count_words`` in a worker process and the
    per-file counts are added together.

    :param files: iterable of file paths.
    :param workers: ``max_workers`` of the process pool.
    :return: ``Counter`` with the combined word counts of all files.
    """
    total = Counter()
    with ProcessPoolExecutor(max_workers=workers) as pool:
        for per_file in pool.map(count_words, files):
            total.update(per_file)
    return total

2. 代碼計時


from time import time
# Take a timestamp before and after the code under measurement and print
# the elapsed seconds.
t0 = time()
# your code here
t1 = time()
print('make_directed: added missing edges {}s'.format(t1 - t0))

1. 創建一個可迭代的文件對象


class WalksCorpus(object):
    """Re-iterable view over the lines of a group of text files."""

    def __init__(self, file_list):
        """
        :param file_list: the list of files that write_walks_to_disk wrote to
            local disk. A single path given as ``str`` or ``bytes`` is
            accepted as well and treated as a one-element list.
        """
        # Iterating a bare path would walk over its characters (or bytes)
        # and try to open each of them as a file, so wrap it in a list.
        if isinstance(file_list, (str, bytes)):
            file_list = [file_list]
        self.file_list = file_list

    def __iter__(self):
        """
        Yield every line of every file, split on whitespace.

        Python's sequence types (list, tuple, string) are iterable, and so
        are dict, set and file objects. A user-defined type becomes iterable
        as well once it provides ``__iter__()`` or ``__getitem__()``.

        Suppose file_list=['output.walks.0', 'output.walks.1'] where
        'output.walks.0' contains:
            8 2 4 8 2 31
            2 1 22 1 18 2
        and 'output.walks.1' contains:
             32 25 26 32
             6 11 1 20

        Iterating over this object then produces, in order:
            ['8', '2', '4', '8', '2', '31']
            ['2', '1', '22', '1', '18', '2']
            ['32', '25', '26', '32']
            ['6', '11', '1', '20']
        The tokens are strings; no conversion to int takes place.

        More on iteration and yield:
        [Python's dark corners (1): understanding the yield keyword](https://liam0205.me/2017/06/30/understanding-yield-in-python/)

        :return: generator of token lists, one per line.
        """
        for file in self.file_list:
            with open(file, 'r') as f:
                for line in f:
                    yield line.split()


# WalksCorpus loops over a collection of file paths, so the single file name
# is wrapped in a list; a bare string would be iterated character by
# character.
walk_files = ["your_file.txt"]
walks = WalksCorpus(walk_files)
發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章