JanusGraph Bulk Data Import: Code Summary

Notes

The code in this article was run against JanusGraph 0.3.1. All data files are the sample files shipped with the JanusGraph distribution.

1. Importing JSON into a local TinkerGraph

1.1 Configuration

The configuration in conf/hadoop-graph/hadoop-load-json.properties is as follows:

#
# Hadoop Graph Configuration
#
gremlin.graph=org.apache.tinkerpop.gremlin.hadoop.structure.HadoopGraph
gremlin.hadoop.graphReader=org.apache.tinkerpop.gremlin.hadoop.structure.io.graphson.GraphSONInputFormat
gremlin.hadoop.graphWriter=org.apache.hadoop.mapreduce.lib.output.NullOutputFormat
gremlin.hadoop.inputLocation=./data/grateful-dead.json
gremlin.hadoop.outputLocation=output
gremlin.hadoop.jarsInDistributedCache=true


#
# SparkGraphComputer Configuration
#
spark.master=local[*]
spark.executor.memory=1g
spark.serializer=org.apache.spark.serializer.KryoSerializer
spark.kryo.registrator=org.apache.tinkerpop.gremlin.spark.structure.io.gryo.GryoRegistrator
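
Before running the bulk load, the input file can optionally be sanity-checked by opening the Hadoop graph with this properties file and running an OLAP count. This is a minimal sketch, not part of the original workflow:

// Optional sanity check: count the vertices in grateful-dead.json via SparkGraphComputer.
readGraph = GraphFactory.open('conf/hadoop-graph/hadoop-load-json.properties')
g = readGraph.traversal().withComputer(SparkGraphComputer)
g.V().count()
// If a later job complains that the output location already exists,
// remove the local 'output' directory before re-running.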

1.2 Sample JSON

{"id":1,"label":"song","inE":{"followedBy":[{"id":3059,"outV":153,"properties":{"weight":1}},{"id":276,"outV":5,"properties":{"weight":2}},{"id":3704,"outV":3,"properties":{"weight":2}},{"id":4383,"outV":62,"pr
operties":{"weight":1}}]},"outE":{"followedBy":[{"id":0,"inV":2,"properties":{"weight":1}},{"id":1,"inV":3,"properties":{"weight":2}},{"id":2,"inV":4,"properties":{"weight":1}},{"id":3,"inV":5,"properties":{"we
ight":1}},{"id":4,"inV":6,"properties":{"weight":1}}],"sungBy":[{"id":7612,"inV":340}],"writtenBy":[{"id":7611,"inV":527}]},"properties":{"name":[{"id":0,"value":"HEY BO DIDDLEY"}],"songType":[{"id":2,"value":"
cover"}],"performances":[{"id":1,"value":5}]}}
{"id":2,"label":"song","inE":{"followedBy":[{"id":0,"outV":1,"properties":{"weight":1}},{"id":323,"outV":34,"properties":{"weight":1}}]},"outE":{"followedBy":[{"id":6190,"inV":123,"properties":{"weight":1}},{"i
d":6191,"inV":50,"properties":{"weight":1}}],"sungBy":[{"id":7666,"inV":525}],"writtenBy":[{"id":7665,"inV":525}]},"properties":{"name":[{"id":3,"value":"IM A MAN"}],"songType":[{"id":5,"value":"cover"}],"perfo
rmances":[{"id":4,"value":1}]}}
s

1.3 Code

// Open the Hadoop (read) graph defined by the properties file above.
readGraph = GraphFactory.open('conf/hadoop-graph/hadoop-load-json.properties')

// Configure the write graph: a TinkerGraph persisted as a Gryo file.
writeGraphConf = new BaseConfiguration()
writeGraphConf.setProperty("gremlin.graph", "org.apache.tinkerpop.gremlin.tinkergraph.structure.TinkerGraph")
writeGraphConf.setProperty("gremlin.tinkergraph.graphFormat", "gryo")
writeGraphConf.setProperty("gremlin.tinkergraph.graphLocation", "/tmp/csv-graph.kryo")

// Build the BulkLoaderVertexProgram with a OneTimeBulkLoader and run it on Spark.
blvp = BulkLoaderVertexProgram.build().bulkLoader(OneTimeBulkLoader).writeGraph(writeGraphConf).create(readGraph)
readGraph.compute(SparkGraphComputer).workers(1).program(blvp).submit().get()


1.4 File verification

The newly generated file:

[root@vm03 data]# ls -l /tmp/csv-graph.kryo 
-rw-r--r--. 1 root root 726353 May 29 04:09 /tmp/csv-graph.kryo
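
Beyond checking the file size, the Gryo file can be reopened as a TinkerGraph and its elements counted. A minimal sketch, assuming the same console session so that writeGraphConf from 1.3 is still bound:

// Re-open /tmp/csv-graph.kryo through the TinkerGraph configuration used for the load.
g = GraphFactory.open(writeGraphConf).traversal()
g.V().count()   // the full grateful-dead dataset should yield 808 vertices
g.E().count()   // and 8049 edges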

2. Importing CSV into a local TinkerGraph

2.1 Configuration

The configuration in conf/hadoop-graph/hadoop-load-csv.properties is as follows:

#
# Hadoop Graph Configuration
#
gremlin.graph=org.apache.tinkerpop.gremlin.hadoop.structure.HadoopGraph
gremlin.hadoop.graphReader=org.apache.tinkerpop.gremlin.hadoop.structure.io.script.ScriptInputFormat
gremlin.hadoop.graphWriter=org.apache.tinkerpop.gremlin.hadoop.structure.io.graphson.GraphSONOutputFormat
gremlin.hadoop.inputLocation=./data/grateful-dead.txt
gremlin.hadoop.outputLocation=output
gremlin.hadoop.jarsInDistributedCache=true
gremlin.hadoop.scriptInputFormat.script=./data/script-input-grateful-dead.groovy

#
# SparkGraphComputer Configuration
#
spark.master=local[*]
spark.executor.memory=1g
spark.serializer=org.apache.spark.serializer.KryoSerializer
spark.kryo.registrator=org.apache.tinkerpop.gremlin.spark.structure.io.gryo.GryoRegistrator



2.2 Sample CSV

Each line describes one vertex and has three tab-separated fields: the vertex itself (id,label,properties), its out-edges, and its in-edges; individual edges are separated by |:

1,song,HEY BO DIDDLEY,cover,5   followedBy,2,1|followedBy,3,2|followedBy,4,1|followedBy,5,1|followedBy,6,1|sungBy,340|writtenBy,527     followedBy,3,2|followedBy,5,2|followedBy,62,1|followedBy,153,1
2,song,IM A MAN,cover,1 followedBy,50,1|followedBy,123,1|sungBy,525|writtenBy,525       followedBy,1,1|followedBy,34,1
3,song,NOT FADE AWAY,cover,531  followedBy,81,1|followedBy,86,5|followedBy,127,10|followedBy,59,1|followedBy,83,3|followedBy,103,2|followedBy,68,1|followedBy,134,2|followedBy,131,1|followedBy,151,1|followedBy,3

2.3 Code

The script-input-grateful-dead.groovy script referenced by gremlin.hadoop.scriptInputFormat.script above is as follows:

def parse(line) {
    def (vertex, outEdges, inEdges) = line.split(/\t/, 3)
    def (v1id, v1label, v1props) = vertex.split(/,/, 3)
    def v1 = graph.addVertex(T.id, v1id.toInteger(), T.label, v1label)
    switch (v1label) {
        case "song":
            def (name, songType, performances) = v1props.split(/,/)
            v1.property("name", name)
            v1.property("songType", songType)
            v1.property("performances", performances.toInteger())
            break
        case "artist":
            v1.property("name", v1props)
            break
        default:
            throw new Exception("Unexpected vertex label: ${v1label}")
    }
    [[outEdges, true], [inEdges, false]].each { def edges, def out ->
        edges.split(/\|/).grep().each { def edge ->
            def parts = edge.split(/,/)
            def otherV, eLabel, weight = null
            if (parts.size() == 2) {
                (eLabel, otherV) = parts
            } else {
                (eLabel, otherV, weight) = parts
            }
            def v2 = graph.addVertex(T.id, otherV.toInteger())
            def e = out ? v1.addOutEdge(eLabel, v2) : v1.addInEdge(eLabel, v2)
            if (weight != null) e.property("weight", weight.toInteger())
        }
    }
    return v1
}
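
The splitting logic of parse() can be exercised on its own with plain string handling (no graph required). A small sketch using the second sample row from 2.2:

// Sanity-check the field layout assumed by parse(): vertex \t out-edges \t in-edges,
// with individual edges separated by '|'.
line = "2,song,IM A MAN,cover,1\tfollowedBy,50,1|followedBy,123,1|sungBy,525|writtenBy,525\tfollowedBy,1,1|followedBy,34,1"
fields = line.split(/\t/, 3)            // [vertex, outEdges, inEdges]
vertexParts = fields[0].split(/,/, 3)   // [id, label, "name,songType,performances"]
assert vertexParts[0] == '2' && vertexParts[1] == 'song'
assert fields[1].split(/\|/).size() == 4   // four out-edges
assert fields[2].split(/\|/).size() == 2   // two in-edges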

The Gremlin console code (same pattern as in 1.3, writing to /tmp/csv-graph2.kryo):

readGraph = GraphFactory.open('conf/hadoop-graph/hadoop-load-csv.properties')
writeGraphConf = new BaseConfiguration()
writeGraphConf.setProperty("gremlin.graph", "org.apache.tinkerpop.gremlin.tinkergraph.structure.TinkerGraph")
writeGraphConf.setProperty("gremlin.tinkergraph.graphFormat", "gryo")
writeGraphConf.setProperty("gremlin.tinkergraph.graphLocation", "/tmp/csv-graph2.kryo")
blvp = BulkLoaderVertexProgram.build().bulkLoader(OneTimeBulkLoader).writeGraph(writeGraphConf).create(readGraph)
readGraph.compute(SparkGraphComputer).workers(1).program(blvp).submit().get()

g = GraphFactory.open(writeGraphConf).traversal()
g.V().valueMap(true)

2.4 File verification

The newly generated file:

[root@vm03 data]# ls -l /tmp/csv-graph2.kryo 
-rw-r--r--. 1 root root 339939 May 29 04:56 /tmp/csv-graph2.kryo

3. Importing JSON into distributed storage (berkeleyje-es)

3.1 Configuration

The configuration in conf/hadoop-graph/hadoop-load-json-ber-es.properties is as follows:

#
# Hadoop Graph Configuration
#
gremlin.graph=org.apache.tinkerpop.gremlin.hadoop.structure.HadoopGraph
gremlin.hadoop.graphReader=org.apache.tinkerpop.gremlin.hadoop.structure.io.graphson.GraphSONInputFormat
gremlin.hadoop.graphWriter=org.apache.hadoop.mapreduce.lib.output.NullOutputFormat
gremlin.hadoop.inputLocation=./data/grateful-dead.json
gremlin.hadoop.outputLocation=output
gremlin.hadoop.jarsInDistributedCache=true


#
# SparkGraphComputer Configuration
#
spark.master=local[*]
spark.executor.memory=1g
spark.serializer=org.apache.spark.serializer.KryoSerializer
spark.kryo.registrator=org.apache.tinkerpop.gremlin.spark.structure.io.gryo.GryoRegistrator

The configuration in ./conf/janusgraph-berkeleyje-es-bulkload.properties is as follows:

gremlin.graph=org.janusgraph.core.JanusGraphFactory
storage.backend=berkeleyje
storage.directory=../db/berkeley
index.search.backend=elasticsearch

3.2 Sample JSON

{"id":1,"label":"song","inE":{"followedBy":[{"id":3059,"outV":153,"properties":{"weight":1}},{"id":276,"outV":5,"properties":{"weight":2}},{"id":3704,"outV":3,"properties":{"weight":2}},{"id":4383,"outV":62,"pr
operties":{"weight":1}}]},"outE":{"followedBy":[{"id":0,"inV":2,"properties":{"weight":1}},{"id":1,"inV":3,"properties":{"weight":2}},{"id":2,"inV":4,"properties":{"weight":1}},{"id":3,"inV":5,"properties":{"we
ight":1}},{"id":4,"inV":6,"properties":{"weight":1}}],"sungBy":[{"id":7612,"inV":340}],"writtenBy":[{"id":7611,"inV":527}]},"properties":{"name":[{"id":0,"value":"HEY BO DIDDLEY"}],"songType":[{"id":2,"value":"
cover"}],"performances":[{"id":1,"value":5}]}}
{"id":2,"label":"song","inE":{"followedBy":[{"id":0,"outV":1,"properties":{"weight":1}},{"id":323,"outV":34,"properties":{"weight":1}}]},"outE":{"followedBy":[{"id":6190,"inV":123,"properties":{"weight":1}},{"i
d":6191,"inV":50,"properties":{"weight":1}}],"sungBy":[{"id":7666,"inV":525}],"writtenBy":[{"id":7665,"inV":525}]},"properties":{"name":[{"id":3,"value":"IM A MAN"}],"songType":[{"id":5,"value":"cover"}],"perfo
rmances":[{"id":4,"value":1}]}}
s

3.3 Code

// Read graph: Hadoop graph over grateful-dead.json; write graph: JanusGraph (berkeleyje + Elasticsearch).
outputGraphConfig = './conf/janusgraph-berkeleyje-es-bulkload.properties'
readGraph = GraphFactory.open('conf/hadoop-graph/hadoop-load-json-ber-es.properties')

// No explicit bulkLoader() here, so the default IncrementalBulkLoader is used.
blvp = BulkLoaderVertexProgram.build().writeGraph(outputGraphConfig).create(readGraph)
readGraph.compute(SparkGraphComputer).workers(1).program(blvp).submit().get()

// Open the freshly loaded JanusGraph and inspect the vertices.
g = GraphFactory.open(outputGraphConfig).traversal()
g.V().valueMap(true)

3.4 Verification

The result is verified by standing up a Gremlin Server instance:

  1. The Gremlin Server configuration file (gremlin-server-berkeleyje-bulkload.yaml) is similar to gremlin-server-berkeleyje.yaml, with the following entry adjusted:
     graph: conf/janusgraph-berkeleyje-es-bulkload.properties
  2. Start the server: ./gremlin-server.sh conf/gremlin-server/gremlin-server-berkeleyje-bulkload.yaml
  3. Query the loaded graph through graphexp, or remotely from the Gremlin console as sketched below.
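
Once the server is running, the loaded data can also be queried remotely from the Gremlin console. This sketch assumes a conf/remote.yaml pointing at the server and a traversal source named g bound in the server's YAML:

:remote connect tinkerpop.server conf/remote.yaml
:> g.V().count()
:> g.V().has('name', 'HEY BO DIDDLEY').valueMap(true)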