Laboratory for Web Algorithmics(LWA)数据集格式转换
Gemini:step1+step2+step4+step5
WebGraph:step1+step2+step3
Gemini要求数据集以bin格式输入,而从LWA下载的数据用WebGraph压缩过,需要特定的解压方法才能满足不同系统的需求。
以下以clueweb12为例说明转换过程:
注:data_sbb为存放数据的文件夹名。
step1:配置WebGraph
注:配置WebGraph这一步,嫌麻烦可以直接从node145的/home/sbb/test01下拷贝lib文件夹到data_sbb目录下,注意要装jdk,不嫌烦就往下走
在data_sbb目录下载WebGraph和相应依赖包:
wget http://search.maven.org/remotecontent?filepath=it/unimi/dsi/webgraph/3.5.2/webgraph-3.5.2.jar
wget http://webgraph.di.unimi.it/webgraph-deps.tar.gz
在下载的同一目录下新建lib文件夹,将解压后的WebGraph和WebGraph-deps的jar包放在lib下:
mkdir lib
cd lib
tar -xzf ../webgraph-deps.tar.gz
cp ../*.jar .
cd ..
在data_sbb目录下测试是否安装成功:
java -cp "lib/*" it.unimi.dsi.webgraph.ArcListASCIIGraph --help
如果没装jdk,在这里看centos安装jdk的步骤
step2:从LWA下载数据集
下载.graph和.properties到data_sbb
wget http://data.law.di.unimi.it/webdata/clueweb12/clueweb12.graph
wget http://data.law.di.unimi.it/webdata/clueweb12/clueweb12.properties
step3:解压成邻接表格式
在data_sbb中运行
java -cp "lib/*" it.unimi.dsi.webgraph.ASCIIGraph clueweb12 clueweb12
得到的clueweb12.graph-txt即为ASCII码格式的邻接表文件(ASCIIGraph会在输出名后自动加上.graph-txt后缀)
step4:解压成边表格式
在data_sbb中运行,这一步得到的结果会非常大,注意内存大小
java -cp "lib/*" it.unimi.dsi.webgraph.ArcListASCIIGraph clueweb12 clueweb12-edgelist.txt
得到的clueweb12-edgelist.txt即为ASCII码格式的边表文件
step5:转成二进制格式(压缩)
将convert2binary.cpp复制到data_sbb下,运行:
g++ convert2binary.cpp -o convert2binary
./convert2binary clueweb12-edgelist.txt clueweb12.bin
convert2binary.cpp
#include <iostream>
#include <vector>
#include <string>
#include <fstream>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <sys/mman.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <unistd.h>
#include <fcntl.h>
#include <string.h>
using namespace std;
// Integer type used for vertex ids.
typedef int ID;
// Unsigned type used for buffer positions and byte counts.
typedef unsigned long long offset_t;
// Capacity, in bytes, of OutFile's internal write buffer.
const offset_t BUFF_SIZE = 0x4000; //16K
// Result code returned by OutFile operations.
enum Status{
    OK = 1,
    ERROR = 0
};

// Buffered writer on top of a POSIX file descriptor.
//
// Bytes are collected in a fixed BUFF_SIZE buffer and handed to ::write()
// when the buffer fills up, on flush(), or on close(). The file
// "<dir>/<name>" is created (or truncated) by the constructor. If it cannot
// be opened, the descriptor stays at -1 and every operation that has to touch
// the file returns ERROR.
//
// The class has no destructor: callers must call close() to flush pending
// bytes and release the descriptor.
class OutFile
{
protected:
    int fos;                 // descriptor returned by open(); -1 when not open
    offset_t wp;             // number of pending bytes at the start of buffer
    char buffer[BUFF_SIZE];  // pending bytes not yet handed to ::write()

    // Writes exactly len bytes from data to the descriptor, retrying after
    // short writes. Returns ERROR if the file is not open or ::write() fails.
    Status write_all(const char* data, size_t len)
    {
        if(fos == -1)
            return ERROR;
        while(len > 0)
        {
            ssize_t n = ::write(fos, data, len);
            // n == 0 with len > 0 would loop forever, so treat it as failure.
            if(n <= 0)
                return ERROR;
            data += n;
            len -= static_cast<size_t>(n);
        }
        return OK;
    }

public:
    std::string dir, name, dir_name;

    // Opens "<dir>/<name>" for writing, creating or truncating it
    // (mode rw-r--r--, subject to the process umask).
    OutFile(std::string name, std::string dir):
        fos(-1), wp(0), dir(dir), name(name), dir_name(dir + "/" + name){
        fos = open(dir_name.c_str(),
                   O_WRONLY | O_CREAT | O_TRUNC,
                   S_IRUSR | S_IWUSR | S_IRGRP | S_IROTH);
        if(fos == -1)
            std::cout << "open OutFile " << dir_name << " failed." << std::endl;
    }

    // Appends len bytes from buff to the file (through the buffer).
    // Returns OK on success, ERROR if data had to be written out and that
    // failed; after an ERROR the buffered state is unspecified.
    Status write(const char* buff, size_t len)
    {
        if(wp + len > BUFF_SIZE)
        {
            // Top the buffer up so a full BUFF_SIZE block goes out.
            offset_t remain = BUFF_SIZE - wp;
            memcpy(buffer + wp, buff, remain);
            if(write_all(buffer, BUFF_SIZE) != OK)
                return ERROR;
            wp = 0;
            len -= remain;
            buff += remain;
            // Write whole BUFF_SIZE multiples straight from the caller's
            // memory; only the tail is buffered below.
            if(len > BUFF_SIZE)
            {
                remain = len - (len / BUFF_SIZE) * BUFF_SIZE;
                if(write_all(buff, len - remain) != OK)
                    return ERROR;
                buff += (len - remain);
                len = remain;
            }
        }
        // Append after the bytes already pending (the original copied to the
        // start of the buffer and overwrote them).
        memcpy(buffer + wp, buff, len);
        wp += len;
        return OK;
    }

    // Writes all pending bytes to the file and empties the buffer.
    Status flush()
    {
        if(write_all(buffer, wp) != OK)
            return ERROR;
        wp = 0;
        return OK;
    }

    // Flushes pending bytes and closes the descriptor. The descriptor is
    // released even when the flush fails; returns ERROR if either step failed.
    Status close()
    {
        Status s = flush();
        if(fos != -1 && ::close(fos) == -1)
        {
            std::cout << "close OutFile " << dir_name << " failed." << std::endl;
            s = ERROR;
        }
        fos = -1;
        return s;
    }

    // Appends the sizeof(T) bytes of an integer value, least significant
    // byte first, regardless of host byte order.
    template<class T>
    Status write_unit(T unit)
    {
        for(size_t i = 0; i < sizeof(T); i++)
        {
            if(wp >= BUFF_SIZE)
            {
                if(write_all(buffer, wp) != OK)
                    return ERROR;
                wp = 0;
            }
            buffer[wp++] = static_cast<char>(unit & 0xff);
            unit >>= 8;
        }
        return OK;
    }
};
void convert(char *fname, char *to_fname)
{
FILE *f = fopen(fname, "r");
OutFile binary_file(string(to_fname), string("."));
//Start read dataset
cout << "Start read dataset " << endl;
ID from;
long long maxlen = 10000000;
char *s = (char*)malloc(maxlen);
char delims[] = " \t\n";
long long edge_cnt = 0;
ID max_id = 0;
while (fgets(s, maxlen, f) != NULL)
{
if (s[0] == '#' || s[0] == '%' || s[0] == '\n')
continue; //Comment
char *t = strtok(s, delims);
from = atoi(t);
max_id = max(max_id, from);
ID to;
while ((t = strtok(NULL, delims)) != NULL)
{
to = atoi(t);
max_id = max(max_id, to);
binary_file.write_unit(from);
binary_file.write_unit(to);
edge_cnt++;
}
/*
if (from % 100000 == 0)
cout << from << endl;
//*/
///*
if (edge_cnt % 1000000 == 0)
cout << "Read edge num: " << edge_cnt << endl;
//*/
}
binary_file.close();
fclose(f);
free(s);
cout << "Edge number: " << edge_cnt << endl;
cout << "End read dataset max id: " << max_id << "vertices num: " max_id + 1 << endl;
}
// Entry point: expects the text dataset path and the binary output file name.
// With any other argument count the usage line is printed instead.
// The exit status is 0 in both cases.
int main(int argc, char *argv[])
{
    const bool has_both_operands = (argc == 3);
    if (has_both_operands)
    {
        char *text_input = argv[1];
        char *binary_output = argv[2];
        convert(text_input, binary_output);
    }
    else
    {
        cout << "Usage ./convert2binary <dataset.txt> <binary edgelist file name>" << endl;
    }
    return 0;
}