Laboratory for Web Algorithmics数据集格式转换

Laboratory for Web Algorithmics数据集格式转换

Gemini:step1+step2+step4+step5

WebGraph:step1+step2+step3


Gemini要求数据集以bin格式输入,而从LAW(Laboratory for Web Algorithmics)下载的数据用WebGraph压缩过,需要先用特定的方法解压并转换格式,才能满足不同系统的输入要求。

以下以clueweb12为例说明转换过程:
注:data_sbb为存放数据的文件夹名。


step1:配置WebGraph

注:配置WebGraph这一步,嫌麻烦可以直接从node145的/home/sbb/test01下拷贝lib文件夹到data_sbb目录下,注意要装jdk,不嫌烦就往下走

在data_sbb目录下载WebGraph和相应依赖包:

wget -O webgraph-3.5.2.jar "http://search.maven.org/remotecontent?filepath=it/unimi/dsi/webgraph/3.5.2/webgraph-3.5.2.jar"
wget http://webgraph.di.unimi.it/webgraph-deps.tar.gz

在下载的同一目录下新建lib文件夹,将解压后的WebGraph和WebGraph-deps的jar包放在lib下:

mkdir lib
cd lib
cp ../webgraph-3.5.2.jar .
tar -xzf ../webgraph-deps.tar.gz
cd ..

在data_sbb目录下测试是否安装成功:

java -cp "lib/*" it.unimi.dsi.webgraph.ArcListASCIIGraph --help

如果没装jdk,在这里看centos安装jdk的步骤

step2:从LAW下载数据集

下载.graph和.properties到data_sbb

wget http://data.law.di.unimi.it/webdata/clueweb12/clueweb12.graph
wget http://data.law.di.unimi.it/webdata/clueweb12/clueweb12.properties

step3:解压成邻接表格式

在data_sbb中运行

java -cp "lib/*" it.unimi.dsi.webgraph.ASCIIGraph clueweb12 clueweb12

得到的clueweb12.graph-txt即为ASCII码格式的邻接表文件

step4:解压成边表格式

在data_sbb中运行,这一步得到的结果会非常大,注意内存大小

java -cp "lib/*" it.unimi.dsi.webgraph.ArcListASCIIGraph clueweb12 clueweb12-edgelist.txt

得到的clueweb12-edgelist.txt即为ASCII码格式的边表文件

step5:转成二进制格式(压缩)

将convert2binary.cpp复制到data_sbb下,运行:

g++ convert2binary.cpp -o convert2binary
./convert2binary clueweb12-edgelist.txt clueweb12.bin

convert2binary.cpp

#include <errno.h>
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/mman.h>
#include <sys/stat.h>
#include <sys/types.h>
#include <unistd.h>
#include <unistd.h>

#include <fstream>
#include <iostream>
#include <string>
#include <vector>

using namespace std;

// Vertex identifier type used for both endpoints of an edge.
typedef int ID;
// Byte count / position inside OutFile's write buffer.
typedef unsigned long long offset_t;

// Capacity of OutFile's in-memory write buffer (16 KiB).
const offset_t BUFF_SIZE = 0x4000;

// Result code returned by every OutFile operation.
enum Status{
    OK = 1,
    ERROR = 0
};

/**
 * Buffered binary file writer built on POSIX open()/write()/close().
 *
 * Bytes are collected in a BUFF_SIZE buffer and handed to the kernel in
 * large chunks. Every operation returns OK or ERROR; an ERROR from write(),
 * write_unit() or flush() means some data could not be stored on disk.
 *
 * The file "<dir>/<name>" is created (or truncated) by the constructor.
 * close() flushes and closes it; the destructor does the same if close()
 * was never called, so buffered data is not silently lost.
 *
 * Instances own a file descriptor and therefore cannot be copied.
 */
class OutFile 
{
protected:
    int fos;                 // file descriptor, -1 while no file is open
    offset_t wp;             // number of pending bytes at the start of buffer
    char buffer[BUFF_SIZE];

    /**
     * Hand exactly len bytes to the kernel.
     * ::write() may store fewer bytes than requested or be interrupted by a
     * signal, so keep going until everything is written or a real error
     * (bad descriptor, disk full, ...) occurs.
     */
    Status write_all(const char* data, size_t len)
    {
        while(len > 0)
        {
            const ssize_t written = ::write(fos, data, len);
            if(written == -1)
            {
                if(errno == EINTR)
                    continue;
                return ERROR;
            }
            data += written;
            len -= static_cast<size_t>(written);
        }
        return OK;
    }

public:
    std::string dir, name, dir_name;

    /**
     * Create/truncate "<dir>/<name>" for writing (mode 0644).
     * If the file cannot be opened a message is printed to stderr and all
     * later operations return ERROR.
     */
    OutFile(std::string name, std::string dir):
        fos(-1), wp(0), dir(dir), name(name), dir_name(dir + "/" + name){
        fos = open(dir_name.c_str(), 
                O_WRONLY | O_CREAT | O_TRUNC, 
                S_IRUSR | S_IWUSR | S_IRGRP | S_IROTH);
        if(fos == -1)
            std::cerr << "open OutFile " << dir_name << " failed: "
                      << strerror(errno) << std::endl;
    }

    // Flush and close the file if the owner forgot to call close().
    ~OutFile()
    {
        if(fos != -1)
            close();
    }

    /**
     * Append len bytes starting at buff.
     * Small writes are only copied into the buffer; once the buffer would
     * overflow it is topped up and written out, and any whole BUFF_SIZE
     * multiples that remain go to the file directly without extra copying.
     */
    Status write(const char* buff, size_t len)
    {
        if(wp + len > BUFF_SIZE)
        {
            // Top up the buffer so that a full BUFF_SIZE chunk is written.
            const size_t room = static_cast<size_t>(BUFF_SIZE - wp);
            memcpy(buffer + wp, buff, room);
            if(write_all(buffer, static_cast<size_t>(BUFF_SIZE)) != OK)
                return ERROR;
            wp = 0;
            len -= room;
            buff += room;

            // Write big chunks straight from the caller's memory.
            if(len > BUFF_SIZE)
            {
                const size_t tail = static_cast<size_t>(len % BUFF_SIZE);
                if(write_all(buff, len - tail) != OK)
                    return ERROR;
                buff += (len - tail);
                len = tail;
            }
        }
        // Append after the bytes that are already pending (wp is 0 here if
        // the buffer has just been written out).
        memcpy(buffer + wp, buff, len);
        wp += len;
        return OK;
    }

    // Write all pending bytes to the file and empty the buffer.
    Status flush()
    {
        if(fos == -1)
            return ERROR;
        if(write_all(buffer, static_cast<size_t>(wp)) != OK)
            return ERROR;
        wp = 0;
        return OK;
    }

    /**
     * Flush pending bytes and close the file.
     * The descriptor is released even when the flush fails; ERROR is
     * returned if either step failed or the file was not open.
     */
    Status close()
    {
        if(fos == -1)
            return ERROR;
        Status s = flush();
        if(::close(fos) == -1)
        {
            std::cout << "close OutFile " << dir_name << " failed." << std::endl;
            s = ERROR;
        }
        fos = -1;
        return s;
    }

    /**
     * Append an integral value as sizeof(T) bytes, least significant byte
     * first, independent of the byte order of the machine.
     */
    template<class T>
    Status write_unit(T unit)
    {
        for(size_t i = 0; i < sizeof(T); i++)
        {
            if(wp >= BUFF_SIZE)
            {   
                if(write_all(buffer, static_cast<size_t>(wp)) != OK)
                    return ERROR;
                wp = 0;
            }
            buffer[wp++] = static_cast<char>(static_cast<unsigned char>(unit & 0xff));
            unit >>= 8;
        }
        return OK;
    }

private:
    // Not copyable: two objects must never share one descriptor.
    OutFile(const OutFile&);
    OutFile& operator=(const OutFile&);
};


void convert(char *fname, char *to_fname)
{
    FILE *f = fopen(fname, "r");
    OutFile binary_file(string(to_fname), string("."));

    //Start read dataset
    cout << "Start read dataset " << endl;
    ID from;
    long long maxlen = 10000000;
    char *s = (char*)malloc(maxlen);
    char delims[] = " \t\n";

    long long edge_cnt = 0;
    ID max_id = 0;

    while (fgets(s, maxlen, f) != NULL)
    {
        if (s[0] == '#' || s[0] == '%' || s[0] == '\n')
            continue; //Comment

        char *t = strtok(s, delims);
        from = atoi(t);
        max_id = max(max_id, from);

        ID to;
        while ((t = strtok(NULL, delims)) != NULL)
        {
            to = atoi(t);
            max_id = max(max_id, to);

            binary_file.write_unit(from);
            binary_file.write_unit(to);

            edge_cnt++;    
        }
        /*
        if (from % 100000 == 0)
            cout << from << endl;
        //*/
        ///*
        if (edge_cnt % 1000000 == 0)
        cout << "Read edge num: " << edge_cnt << endl;
        //*/
    }

    binary_file.close();

    fclose(f);
    free(s);
    cout << "Edge number: " << edge_cnt << endl;    
    cout << "End read dataset max id: " << max_id << "vertices num: " max_id + 1 << endl;
}


/**
 * Command line entry point.
 *
 * Usage: ./convert2binary <dataset.txt> <binary edgelist file name>
 *
 * Called with any other number of arguments, the usage text is printed to
 * stderr and the program exits with a failure status so that scripts can
 * detect the mistake.
 */
int main(int argc, char *argv[])
{
    if (argc != 3)
    {
        std::cerr << "Usage ./convert2binary <dataset.txt> <binary edgelist file name>" << std::endl;
        return EXIT_FAILURE;
    }

    convert(argv[1], argv[2]);

    return 0;
}

發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章