Laboratory for Web Algorithmics數據集格式轉換

Laboratory for Web Algorithmics數據集格式轉換

Gemini:step1+step2+step4+step5

WebGraph:step1+step2+step3


Gemini要求數據集以bin格式輸入,而從LWA下載的數據用WebGraph壓縮過,需要特定的解壓方法才能滿足不同系統的需求。

以下以clueweb12爲例說明轉換過程:
注:data_sbb爲存放數據的文件夾名。


step1:配置WebGraph

注:配置WebGraph這一步,嫌麻煩可以直接從node145的/home/sbb/test01下拷貝lib文件夾到data_sbb目錄下,注意要裝jdk,不嫌煩就往下走

在data_sbb目錄下載WebGraph和相應依賴包:

wget -O webgraph-3.5.2.jar "http://search.maven.org/remotecontent?filepath=it/unimi/dsi/webgraph/3.5.2/webgraph-3.5.2.jar"
wget http://webgraph.di.unimi.it/webgraph-deps.tar.gz

在下載的同一目錄下新建lib文件夾,將解壓後的WebGraph和WebGraph-deps的jar包放在lib下:

mkdir lib
cd lib
cp ../*.jar .
tar -xzf ../webgraph-deps.tar.gz
cd ..

在data_sbb目錄下測試是否安裝成功:

java -cp "lib/*" it.unimi.dsi.webgraph.ArcListASCIIGraph --help

如果沒裝jdk,在這裏看centos安裝jdk的步驟

step2:從LWA下載數據集

下載.graph和.properties到data_sbb

wget http://data.law.di.unimi.it/webdata/clueweb12/clueweb12.graph
wget http://data.law.di.unimi.it/webdata/clueweb12/clueweb12.properties

step3:解壓成鄰接表格式

在data_sbb中運行

java -cp "lib/*" it.unimi.dsi.webgraph.ASCIIGraph clueweb12 clueweb12

得到的clueweb12.graph-txt即爲ASCII碼格式的鄰接表文件

step4:解壓成邊表格式

在data_sbb中運行,這一步得到的結果會非常大,注意內存大小

java -cp "lib/*" it.unimi.dsi.webgraph.ArcListASCIIGraph clueweb12 clueweb12-edgelist.txt

得到的clueweb12-edgelist.txt即爲ASCII碼格式的邊表文件

step5:轉成二進制格式(壓縮)

將convert2binary.cpp複製到data_sbb下,運行:

g++ convert2binary.cpp -o convert2binary
./convert2binary clueweb12-edgelist.txt clueweb12.bin

convert2binary.cpp

#include <algorithm>
#include <iostream>
#include <vector>
#include <string>
#include <fstream>

#include <errno.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <sys/mman.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <unistd.h>
#include <fcntl.h>
#include <string.h>

using namespace std;

// Vertex identifier type; every edge is emitted as two values of this type.
typedef int ID;
// Unsigned type used for buffer positions and byte counts.
typedef unsigned long long offset_t;

// Capacity of the OutFile staging buffer in bytes (16 KiB).
const offset_t BUFF_SIZE = 16 * 1024;

// Result code returned by the OutFile operations.
enum Status
{
    ERROR = 0, // the operation failed
    OK = 1     // the operation succeeded
};

class OutFile 
{
protected:
    int fos;
    offset_t wp;
    char buffer[BUFF_SIZE];
public:
    string dir, name, dir_name;

    OutFile(string name, string dir):
        dir(dir), name(name), dir_name(dir + "/" + name),wp(0){
        fos = open((dir_name).c_str(), 
                O_WRONLY | O_CREAT | O_TRUNC, 
                S_IRUSR | S_IWUSR | S_IRGRP | S_IROTH);
    }

    Status write(const char* buff, size_t len)
    {
        //Fill the buffer
        if(wp + len > BUFF_SIZE)
        {
            offset_t remain = BUFF_SIZE - wp;
            memcpy(buffer + wp, buff, remain);
            ::write(fos, buffer, BUFF_SIZE);
            wp = 0;
            len -= remain;
            buff += remain;

            //write big chunks if any   
            if(len > BUFF_SIZE)
            {
                remain = len - (len / BUFF_SIZE) * BUFF_SIZE;
                ::write(fos, buff, len - remain);
                buff += (len - remain);
                len = remain;
            }
        }
        memcpy(buffer, buff, len);
        wp += len;
        return OK;
    }

    Status flush()
    {
        if(::write(fos, buffer, wp) == -1)
            return ERROR;
        wp = 0;
        return OK;
    }

    Status close()
    {
        Status s = flush();
        if(s != OK) return s;
        if(::close(fos) == -1)
        {
            cout << "close OutFile " << dir_name << " failed." << endl;
            return ERROR;
        }
        return OK;
    }

    template<class T>
    Status write_unit(T unit)
    {
        for(int i = 0; i < sizeof(T); i++)
        {
            if(wp >= BUFF_SIZE)
            {   
                ::write(fos, buffer, wp);
                wp = 0;
            }
            char c = static_cast<unsigned char> (unit & 0xff);
            buffer[wp++] = c;
            unit >>= 8;
        }
        return OK;
    }

};


void convert(char *fname, char *to_fname)
{
    FILE *f = fopen(fname, "r");
    OutFile binary_file(string(to_fname), string("."));

    //Start read dataset
    cout << "Start read dataset " << endl;
    ID from;
    long long maxlen = 10000000;
    char *s = (char*)malloc(maxlen);
    char delims[] = " \t\n";

    long long edge_cnt = 0;
    ID max_id = 0;

    while (fgets(s, maxlen, f) != NULL)
    {
        if (s[0] == '#' || s[0] == '%' || s[0] == '\n')
            continue; //Comment

        char *t = strtok(s, delims);
        from = atoi(t);
        max_id = max(max_id, from);

        ID to;
        while ((t = strtok(NULL, delims)) != NULL)
        {
            to = atoi(t);
            max_id = max(max_id, to);

            binary_file.write_unit(from);
            binary_file.write_unit(to);

            edge_cnt++;    
        }
        /*
        if (from % 100000 == 0)
            cout << from << endl;
        //*/
        ///*
        if (edge_cnt % 1000000 == 0)
        cout << "Read edge num: " << edge_cnt << endl;
        //*/
    }

    binary_file.close();

    fclose(f);
    free(s);
    cout << "Edge number: " << edge_cnt << endl;    
    cout << "End read dataset max id: " << max_id << "vertices num: " max_id + 1 << endl;
}


// Entry point. Expects exactly two arguments: the text input file and the
// name of the binary output file. With any other argument count it prints
// the usage line and does nothing else.
int main(int argc, char *argv[])
{
    const bool has_both_paths = (argc == 3);
    if (has_both_paths)
    {
        char *input_path = argv[1];
        char *output_name = argv[2];
        convert(input_path, output_name);
        return 0;
    }

    cout << "Usage ./convert2binary <dataset.txt> <binary edgelist file name>" << endl;
    return 0;
}

發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章