1. Prepare test.ttl, an RDF triples file (Turtle) to test with; a minimal example is sketched after this list.
2. Upload it to HDFS: hadoop fs -put test.ttl
3. Run the Spark application with spark-submit:
: $HOME/spark/bin/spark-submit --master spark://sda1:7077,sda2:7077 --deploy-mode client --class ElephasTestBySpark --jars icbms-assembly-2.0.jar icbms-assembly-2.0.jar test.ttl
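For reference, a minimal test.ttl could look like the following; the prefix and resource names are placeholders, not taken from the original post.
-------------------------test.ttl (example)-------------------------
@prefix ex: <http://example.org/> .

ex:subject1 ex:predicate1 ex:object1 .
ex:subject1 ex:predicate2 "a literal value" .
ex:subject2 ex:predicate1 ex:object2 .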
---------build.sbt (partial)-----------
libraryDependencies ++= Seq(
  // Jena Elephas (RDF I/O for Hadoop)
  ("org.apache.jena" % "jena-elephas-common" % "3.1.0"),
  ("org.apache.jena" % "jena-elephas-io" % "3.1.0"),
  ("org.apache.jena" % "jena-elephas-mapreduce" % "3.1.0"),
  // Hadoop (provided by the cluster at runtime)
  ("org.apache.hadoop" % "hadoop-common" % "2.7.2" % "provided"),
  ("org.apache.hadoop" % "hadoop-mapreduce-client-common" % "2.7.2" % "provided")
)
-------------------------ElephasTestBySpark.scala---------------------
import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.jena.hadoop.rdf.io.input.TriplesInputFormat
import org.apache.hadoop.io.LongWritable
import org.apache.jena.hadoop.rdf.types.TripleWritable
import org.apache.hadoop.conf.Configuration

object ElephasTestBySpark {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("ElephasTestBySpark")
    val sc = new SparkContext(conf)
    val hadoopConf = new Configuration()

    // Read the triples file passed as args(0) with Jena Elephas' TriplesInputFormat:
    // keys are byte offsets into the file, values are the parsed triples.
    val rdfTriples = sc.newAPIHadoopFile(args(0),
      classOf[TriplesInputFormat],
      classOf[LongWritable],
      classOf[TripleWritable],
      hadoopConf)

    // Hadoop RecordReaders reuse the same Writable instances across records,
    // so copy the data out (offset as Long, triple as an immutable Jena Triple)
    // before collecting or aggregating.
    val triples = rdfTriples.map { case (offset, triple) => (offset.get, triple.get) }

    println("take 10 start-------------------")
    triples.take(10).foreach(println)
    println("take 10 end-------------------")

    println("countByKey start-------------------")
    triples.countByKey().foreach(println)
    println("countByKey end-------------------")
  }
}
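Once the values have been copied out as plain Jena Triple objects, the regular Triple API is available on the RDD. As a quick illustration (not in the original post), the following sketch reuses the triples RDD from above to count triples per predicate:
-------------------------predicate counts (illustrative)-------------------------
// Group by the predicate node's string form and count occurrences.
val predicateCounts = triples
  .map { case (_, t) => (t.getPredicate.toString, 1L) }
  .reduceByKey(_ + _)

predicateCounts.collect().foreach { case (p, n) => println(s"$p -> $n") }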