Hands-On Exercise (12): Using Association Rules to Find the Key Pages in Frequent Itemsets of User Behavior

In my daily work and study I like to dig for the value in data and hunt for its secrets. In my view, the value of data is not limited to the enterprise: individuals can feel its appeal too, using technology to decode behavior and letting big data give everyone a boost. Readers are welcome to follow my public account, where we can discuss the interesting things hidden in data together.

My public account is: livandata

The beer-and-diapers story is one of the most important cases for provoking thought about big data; since it appeared, countless people have studied it in depth. This article borrows the idea and applies it to website pages: each page a user visits is treated as a product, and frequent-itemset association rules are used to analyze which other pages the visitors of a given page also visit, yielding association rules between the pages users access.

When designing pages, we need to know which key pages users actually visit and, for a given key page, which important pages users pass through on their way to it. Identifying those important pages lets us optimize the user's navigation path.

The biggest practical obstacle in mining page association rules is data volume. Page-visit information can be extracted from user behavior data, but because the volume is large, real applications usually need distributed processing. Constrained by the tools available, this article simulates the distributed process in Python by reading the file in batches.
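
As a minimal sketch of the batching idea (the chunk size is arbitrary here), pandas can stream a large CSV in fixed-size chunks so that only one batch sits in memory at a time; the run file in step 6 uses exactly this pattern:

import pandas as pd

# Stream the behavior log batch by batch instead of loading it whole.
reader = pd.read_csv('../data/sub_customer.csv', iterator=True, dtype=str)
while True:
    try:
        chunk = reader.get_chunk(100000)  # one batch of 100k rows
        # ... mine frequent itemsets on this chunk, then merge the counts ...
    except StopIteration:
        break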

The code is as follows:

1. The Apriori algorithm:

#coding=utf-8
# Personal public account: livandata
def apriori(D, minSup):
    '''Level-wise Apriori: return every frequent itemset in D with its support count.'''
    # Count the support of each individual item (candidate 1-itemsets).
    C1 = {}
    for T in D:
        for I in T:
            if I in C1:
                C1[I] += 1
            else:
                C1[I] = 1
    keys1 = [[i] for i in C1.keys()]
    # Keep the 1-itemsets whose relative support reaches minSup.
    n = len(D)
    cutKeys1 = [k for k in keys1 if C1[k[0]] * 1.0 / n >= minSup]
    cutKeys1.sort()
    keys = cutKeys1
    all_keys = []
    all_C = []
    while keys != []:
        # Count the current candidates, prune, then join to build the next level.
        C = getC(D, keys)
        cutKeys, curC = getCutKeys(keys, C, minSup, n)
        all_keys.extend(cutKeys)
        all_C.extend(curC)
        keys = apriori_gen(cutKeys)
    return all_keys, all_C

def getC(D, keys):
    '''Count, for each candidate itemset in keys, how many transactions of D contain it.'''
    C = []
    for key in keys:
        c = 0
        for T in D:
            if keyInT(key, T):
                c += 1
        C.append(c)
    return C

def getCutKeys(keys, C, minSup, length):
    '''Prune step: keep only the itemsets whose relative support reaches minSup.'''
    keyss = []
    Cs = []
    for i, key in enumerate(keys):
        if float(C[i]) / length >= minSup:
            keyss.append(key)
            Cs.append(C[i])
    return keyss, Cs

def keyInT(key, T):
    '''Return True if every item of `key` appears in transaction T.'''
    for k in key:
        if k not in T:      # one missing item is enough to reject
            return False
    return True

def apriori_gen(keys1):
    '''Join step: merge pairs of frequent k-itemsets into (k+1)-itemset candidates.'''
    keys2 = []
    for k1 in keys1:
        for k2 in keys1:
            if k1 != k2:
                key = sorted(set(k1) | set(k2))
                # Only unions exactly one item larger are valid (k+1)-candidates;
                # bigger unions would skip a level of support checking.
                if len(key) == len(k1) + 1 and key not in keys2:
                    keys2.append(key)
    return keys2
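
For reference, a minimal usage sketch of apriori on a toy session list (the page names are invented for illustration); it is guarded so it only runs when the module is executed directly:

if __name__ == '__main__':
    # Four toy sessions; an itemset must appear in at least half of them.
    D = [['home', 'credit_card', 'loan'],
         ['home', 'credit_card'],
         ['home', 'loan'],
         ['credit_card', 'loan']]
    keys, counts = apriori(D, minSup=0.5)
    for k, c in zip(keys, counts):
        print(k, c)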

2. The FP-tree (FP-growth) algorithm:

# encoding: utf-8
# Personal public account: livandata
from collections import defaultdict, namedtuple

# Original author information retained; this version was updated by lina.
__license__ = 'MIT License'
def find_frequent_itemsets(transactions, minimum_support, include_support=False):
    """
    Find frequent itemsets in the given transactions using FP-growth. This
    function returns a generator instead of an eagerly-populated list of items.

    The `transactions` parameter can be any iterable of iterables of items.
    `minimum_support` should be an integer specifying the minimum number of
    occurrences of an itemset for it to be accepted.

    Each item must be hashable (i.e., it must be valid as a member of a
    dictionary or a set).

    If `include_support` is true, yield (itemset, support) pairs instead of
    just the itemsets.
    """
    items = defaultdict(lambda: 0)  # mapping from items to their supports

    # Load the passed-in transactions and count the support that individual
    # items have.
    for transaction in transactions:
        for item in transaction:
            items[item] += 1

    # Remove infrequent items from the item support dictionary.
    items = dict((item, support) for item, support in items.items()
        if support >= minimum_support)

    # Build our FP-tree. Before any transactions can be added to the tree, they
    # must be stripped of infrequent items and their surviving items must be
    # sorted in decreasing order of frequency.
    def clean_transaction(transaction):
        transaction = filter(lambda v: v in items, transaction)
        transaction_list = list(transaction)   # materialize the filter iterator so it can be sorted (Python 3)
        transaction_list.sort(key=lambda v: items[v], reverse=True)
        return transaction_list

    master = FPTree()
    for transaction in map(clean_transaction, transactions):
        master.add(transaction)

    def find_with_suffix(tree, suffix):
        for item, nodes in tree.items():
            support = sum(n.count for n in nodes)
            if support >= minimum_support and item not in suffix:
                # New winner!
                found_set = [item] + suffix
                yield (found_set, support) if include_support else found_set

                # Build a conditional tree and recursively search for frequent
                # itemsets within it.
                cond_tree = conditional_tree_from_paths(tree.prefix_paths(item))
                for s in find_with_suffix(cond_tree, found_set):
                    yield s # pass along the good news to our caller

    # Search for frequent itemsets, and yield the results we find.
    for itemset in find_with_suffix(master, []):
        yield itemset

class FPTree(object):
    """
    An FP tree.

    This object may only store transaction items that are hashable
    (i.e., all items must be valid as dictionary keys or set members).
    """

    Route = namedtuple('Route', 'head tail')

    def __init__(self):
        # The root node of the tree.
        self._root = FPNode(self, None, None)

        # A dictionary mapping items to the head and tail of a path of
        # "neighbors" that will hit every node containing that item.
        self._routes = {}

    @property
    def root(self):
        """The root node of the tree."""
        return self._root

    def add(self, transaction):
        """Add a transaction to the tree."""
        point = self._root

        for item in transaction:
            next_point = point.search(item)
            if next_point:
                # There is already a node in this tree for the current
                # transaction item; reuse it.
                next_point.increment()
            else:
                # Create a new point and add it as a child of the point we're
                # currently looking at.
                next_point = FPNode(self, item)
                point.add(next_point)

                # Update the route of nodes that contain this item to include
                # our new node.
                self._update_route(next_point)

            point = next_point

    def _update_route(self, point):
        """Add the given node to the route through all nodes for its item."""
        assert self is point.tree

        try:
            route = self._routes[point.item]
            route[1].neighbor = point # route[1] is the tail
            self._routes[point.item] = self.Route(route[0], point)
        except KeyError:
            # First node for this item; start a new route.
            self._routes[point.item] = self.Route(point, point)

    def items(self):
        """
        Generate one 2-tuple for each item represented in the tree. The first
        element of the tuple is the item itself, and the second element is a
        generator that will yield the nodes in the tree that belong to the item.
        """
        for item in self._routes.keys():
            yield (item, self.nodes(item))

    def nodes(self, item):
        """
        Generate the sequence of nodes that contain the given item.
        """

        try:
            node = self._routes[item][0]
        except KeyError:
            return

        while node:
            yield node
            node = node.neighbor

    def prefix_paths(self, item):
        """Generate the prefix paths that end with the given item."""

        def collect_path(node):
            path = []
            while node and not node.root:
                path.append(node)
                node = node.parent
            path.reverse()
            return path

        return (collect_path(node) for node in self.nodes(item))

    def inspect(self):
        print('Tree:')
        self.root.inspect(1)

        print()
        print('Routes:')
        for item, nodes in self.items():
            print('  %r' % item)
            for node in nodes:
                print('    %r' % node)

def conditional_tree_from_paths(paths):
    """Build a conditional FP-tree from the given prefix paths."""
    tree = FPTree()
    condition_item = None
    items = set()

    # Import the nodes in the paths into the new tree. Only the counts of the
    # leaf nodes matter; the remaining counts will be reconstructed from the
    # leaf counts.
    for path in paths:
        if condition_item is None:
            condition_item = path[-1].item

        point = tree.root
        for node in path:
            next_point = point.search(node.item)
            if not next_point:
                # Add a new node to the tree.
                items.add(node.item)
                count = node.count if node.item == condition_item else 0
                next_point = FPNode(tree, node.item, count)
                point.add(next_point)
                tree._update_route(next_point)
            point = next_point

    assert condition_item is not None

    # Calculate the counts of the non-leaf nodes.
    for path in tree.prefix_paths(condition_item):
        count = path[-1].count
        for node in reversed(path[:-1]):
            node._count += count

    return tree

class FPNode(object):
    """A node in an FP tree."""

    def __init__(self, tree, item, count=1):
        self._tree = tree
        self._item = item
        self._count = count
        self._parent = None
        self._children = {}
        self._neighbor = None

    def add(self, child):
        """Add the given FPNode `child` as a child of this node."""

        if not isinstance(child, FPNode):
            raise TypeError("Can only add other FPNodes as children")

        if child.item not in self._children:
            self._children[child.item] = child
            child.parent = self

    def search(self, item):
        """
        Check whether this node contains a child node for the given item.
        If so, that node is returned; otherwise, `None` is returned.
        """
        try:
            return self._children[item]
        except KeyError:
            return None

    def __contains__(self, item):
        return item in self._children

    @property
    def tree(self):
        """The tree in which this node appears."""
        return self._tree

    @property
    def item(self):
        """The item contained in this node."""
        return self._item

    @property
    def count(self):
        """The count associated with this node's item."""
        return self._count

    def increment(self):
        """Increment the count associated with this node's item."""
        if self._count is None:
            raise ValueError("Root nodes have no associated count.")
        self._count += 1

    @property
    def root(self):
        """True if this node is the root of a tree; false if otherwise."""
        return self._item is None and self._count is None

    @property
    def leaf(self):
        """True if this node is a leaf in the tree; false if otherwise."""
        return len(self._children) == 0

    @property
    def parent(self):
        """The node's parent"""
        return self._parent

    @parent.setter
    def parent(self, value):
        if value is not None and not isinstance(value, FPNode):
            raise TypeError("A node must have an FPNode as a parent.")
        if value and value.tree is not self.tree:
            raise ValueError("Cannot have a parent from another tree.")
        self._parent = value

    @property
    def neighbor(self):
        """
        The node's neighbor; the one with the same value that is "to the right"
        of it in the tree.
        """
        return self._neighbor

    @neighbor.setter
    def neighbor(self, value):
        if value is not None and not isinstance(value, FPNode):
            raise TypeError("A node must have an FPNode as a neighbor.")
        if value and value.tree is not self.tree:
            raise ValueError("Cannot have a neighbor from another tree.")
        self._neighbor = value

    @property
    def children(self):
        """The nodes that are children of this node."""
        return tuple(self._children.values())  # Python 3: values() replaces itervalues()

    def inspect(self, depth=0):
        print(('  ' * depth) + repr(self))
        for child in self.children:
            child.inspect(depth + 1)

    def __repr__(self):
        if self.root:
            return "<%s (root)>" % type(self).__name__
        return "<%s %r (%r)>" % (type(self).__name__, self.item, self.count)


if __name__ == '__main__':
    from optparse import OptionParser
    import csv

    p = OptionParser(usage='%prog data_file')
    p.add_option('-s', '--minimum-support', dest='minsup', type='int',
        help='Minimum itemset support (default: 2)')
    p.add_option('-n', '--numeric', dest='numeric', action='store_true',
        help='Convert the values in datasets to numerals (default: false)')
    p.set_defaults(minsup=2)
    p.set_defaults(numeric=False)

    options, args = p.parse_args()
    if len(args) < 1:
        p.error('must provide the path to a CSV file to read')

    transactions = []
    with open(args[0]) as database:
        for row in csv.reader(database):
            if options.numeric:
                transaction = []
                for item in row:
                    transaction.append(int(item))  # Python 3: int replaces long
                transactions.append(transaction)
            else:
                transactions.append(row)

    result = []
    for itemset, support in find_frequent_itemsets(transactions, options.minsup, True):
        result.append((itemset, support))

    result = sorted(result, key=lambda i: i[0])
    for itemset, support in result:
        print(str(itemset) + ' ' + str(support))

The two algorithms above were found online; they serve as the base algorithms for this data-mining exercise.
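
As a quick sanity check, find_frequent_itemsets can be called directly on toy sessions (invented names; note that its minimum_support is an absolute count, unlike the ratio taken by apriori above):

from Fp_growth import find_frequent_itemsets

transactions = [['home', 'credit_card', 'loan'],
                ['home', 'credit_card'],
                ['home', 'loan'],
                ['credit_card', 'loan']]
# Keep itemsets that occur in at least 2 of the 4 sessions.
for itemset, support in find_frequent_itemsets(transactions, 2, include_support=True):
    print(itemset, support)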

3. The data_analysis file does some basic cleanup of the data: it consolidates categories that are not on the same level and converts entries whose page information is hard to distinguish.

#!/usr/bin/env python
# _*_ UTF-8 _*_
# Personal public account: livandata

import re
def open_big_data(path):
    with open(path) as f:
        for i in f:
            yield i

def data_check(sess_data):
    with open('pingan_pro', 'r') as f:
        data_c = f.read()
    check_data = data_c.split(',\n')
    for i in range(len(sess_data)):
        for j in range(len(sess_data[i])):
            if(sess_data[i][j]=='今日步數' or sess_data[i][j] == '免費領月卡' or sess_data[i][j] == '健康服務'):
                sess_data[i][j] = '我的健康'
            if (sess_data[i][j] == '購房貸' or sess_data[i][j] == '買傢俬' or sess_data[i][j] == '裝修超預算'):
                sess_data[i][j] = '房屋貸款'

            if ((re.search('消息中心', sess_data[i][j]) != None)):
                sess_data[i][j] = '消息中心'

            if ((re.search('信用卡', sess_data[i][j]) != None)
                or (re.search('信用額度', sess_data[i][j]) != None)
                or (re.search('臨額調整', sess_data[i][j]) != None)
                or (re.search('我的額度', sess_data[i][j]) != None)
                or (re.search('額度評估', sess_data[i][j]) != None)
                or (re.search('還款', sess_data[i][j]) != None)):
                sess_data[i][j] = '信用卡'

            if ((re.search('二維碼', sess_data[i][j]) != None)
                or (re.search('支付記錄', sess_data[i][j]) != None)):
                sess_data[i][j] = '收付款'

            if ((re.search('通訊錄', sess_data[i][j]) != None)):
                sess_data[i][j] = '通訊錄'

            if ((re.search('http:', sess_data[i][j]) != None)
                or (re.search('結束頁', sess_data[i][j]) != None)
                or (re.search('首頁', sess_data[i][j]) != None)
                or (re.search('購買', sess_data[i][j]) != None)
                or (re.search('申請記錄', sess_data[i][j]) != None)
                or (re.search('交易詳情頁', sess_data[i][j]) != None)):
                sess_data[i][j] = 'nan'

            for t in range(len(check_data)):
                if(re.search(check_data[t], sess_data[i][j])!=None):
                    sess_data[i][j] = check_data[t]

    for i in range(len(sess_data)):
        page_tmp = list(set(sess_data[i]))  # dedupe pages within a session (order is not preserved)
        sess_data[i] = [p for p in page_tmp if p != 'nan']

    return sess_data
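
A minimal usage sketch (it assumes the pingan_pro mapping file from step 5 is present in the working directory; the exact output also depends on that file's contents, and ordering may vary because of the set() dedup):

import data_analysis as das

sessions = [['今日步數', '信用卡還款', 'nan'],
            ['二維碼', '首頁']]
print(das.data_check(sessions))
# e.g. [['我的健康', '信用卡'], ['收付款']]  (order may differ)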

4. The refresh_data file stores the analysis results; the distributed merge is simulated through these file reads and writes:

#!/usr/bin/env python
# _*_ UTF-8 _*_
# Personal public account: livandata

import os
def write_result(items):
    with open('data_result.txt', 'a+') as f:
        for it in items:
            f.write(str(it)+':'+str(items[it])+'\n')

def read_result(items):
    '''Merge the counts of this batch (`items`) into those already stored in
    data_result.txt. Returns (old line -> updated line, items not yet stored).'''
    data_res = {}
    data_res_2 = {}
    with open('data_result.txt', 'r+') as f:
        for data_tmp in f:
            datas_tmp = data_tmp.split('\n')
            datas = datas_tmp[0].split(':')
            for it in items:
                if(datas[0] == it):
                    # Add this batch's count to the stored count.
                    datas_val = str(int(datas[1])+int(items[it]))
                    data_res[it] = datas_val
                    data_res_2[data_tmp] = datas[0]+':'+datas_val+'\n'

    # Items that appear in this batch but are not yet in the file.
    datass_res_list = [i for i in data_res]
    items_list = [j for j in items]
    res = list(set(items_list).difference(set(datass_res_list)))
    datass_={}
    for i in res:
        if(i in list(items.keys())):
            datass_[i] = items[i]
    return data_res_2, datass_

def refresh_data(items):
    if(os.path.exists('data_result.txt')):
        datas_res, datas_new = read_result(items)
        # Append the items seen for the first time in this batch.
        with open('data_result.txt', 'a+') as f:
            for i in datas_new:
                f.write(i+':'+str(datas_new[i])+'\n')
        # Rewrite the file, substituting updated counts for stale lines.
        datas_res_li = [i for i in datas_res]
        with open('data_result.txt', 'r+') as f:
            for j in f:
                if(j not in datas_res_li):
                    with open('data_result2.txt', 'a+') as f2:
                        f2.write(j)
                else:
                    with open('data_result2.txt', 'a+') as f2:
                        f2.write(datas_res[j])
        os.remove('data_result.txt')
        os.rename('data_result2.txt', 'data_result.txt')
    else:
        write_result(items)
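
A minimal sketch of how successive batches accumulate in data_result.txt (the keys follow the str(itemset) convention used in the run file, and counts are stored as strings):

import refresh_data as rfd

rfd.refresh_data({"['home', 'loan']": '3'})  # first batch: creates data_result.txt with count 3
rfd.refresh_data({"['home', 'loan']": '2'})  # second batch: the stored line is merged to count 5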

5. The pro file (opened as pingan_pro in the code above) is the reference library used for the page conversions:

96搜索,
借錢,
口袋社區,
領券中心,
猜金價,
種搖錢樹,
車主貸,
宅易通,

6. The run file drives the whole process:

#!/usr/bin/env python
# _*_ UTF-8 _*_
# Personal public account: livandata

import pandas as pd
import data_analysis as das
import Fp_growth as fpg
import refresh_data as rfd

path = '../data/sub_customer.csv'
loop = True
chunkSize = 10
reader = pd.read_csv(path, iterator=True, dtype=str)
while loop:
    try:
        chunk = reader.get_chunk(chunkSize).fillna('nan')
        data = chunk[chunk['page_name']!='nan']['page_name'].reset_index()
        page_names = []
        for i in range(len(data['page_name'])):
            names = data['page_name'][i].split('"')
            page_name = [j for j in names if(j!='[' and j!=']' and j!=',')]
            page_names.append(page_name)
        page_names = das.data_check(page_names)  # fix: pass the whole batch, not a single session
        page_names = [i for i in page_names if i!=[]]

        frequent_itemsets = fpg.find_frequent_itemsets(page_names, minimum_support=1,
                                                       include_support=True)
        result = []
        for itemset, support in frequent_itemsets:
            result.append((itemset, support))
        items = {}
        n = 5
        minSup = 0.6
        for itemset, support in result:
            keys = str(itemset)
            values = str(support)
            if(float(values)/n >= minSup):
                items[keys] = values

        rfd.refresh_data(items)
    except StopIteration:
        loop = False
        print('Iteration is stopped')

The above is the full run of the algorithm. It incorporates pseudo-distributed processing, modeled mainly on Hadoop's approach.

Because association-rule mining is itself time-consuming, parallelism is usually needed to save time. The code below adapts the pipeline accordingly, running batches in a multiprocessing pool and dropping the most expensive parts:

#!/usr/bin/env python
# _*_ UTF-8 _*_
# Personal public account: livandata

import pandas as pd
import data_analysis as das
import Fp_growth as fpg
from multiprocessing import Pool
import time
import os

def data_process(chunk, idx):
    data = chunk[chunk['page_name'] != 'nan']['page_name'].reset_index()
    page_names = []
    for i in range(len(data['page_name'])):
        names = data['page_name'][i].split('"')
        page_name = [j for j in names if (j != '[' and j != ']' and j != ',')]
        page_names.append(page_name)
    page_names = das.data_check(page_names)  # fix: pass the whole batch, not a single session
    page_names = [i for i in page_names if i != []]

    frequent_itemsets = fpg.find_frequent_itemsets(page_names, minimum_support=1,
                                                   include_support=True)
    with open('data_result/data_result_%s.txt' % idx, 'w+') as f:
        for it in frequent_itemsets:
            if(len(it[0])>=2):
                f.write(str(it[0])+':'+str(it[1])+'\n')

def reduce_data(path):
    time_tmp = time.localtime(time.time())
    dates = str(time_tmp.tm_year)+str(time_tmp.tm_mon)+str(time_tmp.tm_mday)
    pathdir = os.listdir(path)
    result_data = {}
    for dir in pathdir:
        dir_t = path+'/'+dir
        if(os.path.isfile(dir_t)):
            with open(dir_t, 'r+') as f:
                data = f.read()
                data = data.split('\n')
                for da in data:
                    das = da.split(':')
                    if(len(das)>1):
                        if(das[0] in list(result_data.keys())):
                            result_data[das[0]] = result_data[das[0]]+int(das[1])
                        else:
                            result_data[das[0]] = int(das[1])

    with open('data_result.txt', 'w+') as f:
        for it in result_data:
            f.write(dates+','+str(it)+','+str(result_data[it])+'\n')

def main():
    path = '../data/sub_customer.csv'
    loop = True
    chunkSize = 1000000
    reader = pd.read_csv(path, iterator=True, dtype=str)
    idx = 0
    ps = Pool(8)
    while loop:
        try:
            chunk = reader.get_chunk(chunkSize).fillna('nan')
            ps.apply_async(data_process, args=(chunk, idx,))
            idx = idx + 1
        except StopIteration:
            loop = False
            print('Iteration is stopped')
    ps.close()
    ps.join()
    
    path = 'data_result'
    reduce_data(path)
    
if __name__ == '__main__':
    main()

The corresponding code is available at: https://download.csdn.net/download/livan1234/11238216
