An example of retrieving only the Math marks using a Spark DataFrame
sparkexamples > c20 > a20 > a20
sparkexamples > c20 > a20 > a20
$ cat c15_plain_text.input
English|70
Math|80
Math|75
Science|80
[raj@Rajkumars-MacBook-Pro ~/gitws/sparkexamples/src/main/scala/c20_SparkSQL/a20_case_class]$spark-shell --master local[*]
2016-05-29 14:17:43.005 java[35420:7318795] Unable to load realm info from SCDynamicStore
Welcome to
      ____              __
     / __/__  ___ _____/ /__
    _\ \/ _ \/ _ `/ __/  '_/
   /___/ .__/\_,_/_/ /_/\_\   version 1.6.1
      /_/
Using Scala version 2.10.5 (Java HotSpot(TM) 64-Bit Server VM, Java 1.7.0_45)
Type in expressions to have them evaluated.
Type :help for more information.
Spark context available as sc.
SQL context available as sqlContext.
scala> // A case class is needed so that Spark can infer the schema and let us register the data as a table
scala> case class Subject(name:String, mark:String)
defined class Subject
scala> val input = sc.textFile("file:///Users/raj/gitws/sparkexamples/src/main/scala/c20_SparkSQL/a20_case_class/c15_plain_text.input")
input: org.apache.spark.rdd.RDD[String] = file:///Users/raj/gitws/sparkexamples/src/main/scala/c20_SparkSQL/a20_case_class/c15_plain_text.input MapPartitionsRDD[63] at textFile at <console>:27
scala> input.take(10)
res52: Array[String] = Array(English|70, Math|80, Math|75, Science|80)
scala> val splittedRdd = input.map(_.split("\\|"))
splittedRdd: org.apache.spark.rdd.RDD[Array[String]] = MapPartitionsRDD[64] at map at <console>:29
scala> splittedRdd.take(10)
res53: Array[Array[String]] = Array(Array(English, 70), Array(Math, 80), Array(Math, 75), Array(Science, 80))
scala> val subjectRdd = splittedRdd.map(x => Subject(x(0), x(1)))
subjectRdd: org.apache.spark.rdd.RDD[Subject] = MapPartitionsRDD[65] at map at <console>:33
scala> subjectRdd.take(10)
res54: Array[Subject] = Array(Subject(English,70), Subject(Math,80), Subject(Math,75), Subject(Science,80))
scala> val subjectDf = subjectRdd.toDF
subjectDf: org.apache.spark.sql.DataFrame = [name: string, mark: string]
scala> subjectDf.take(10)
res55: Array[org.apache.spark.sql.Row] = Array([English,70], [Math,80], [Math,75], [Science,80])
scala> subjectDf.printSchema()
root
|-- name: string (nullable = true)
|-- mark: string (nullable = true)
scala> val selected = subjectDf.select(subjectDf("name"))
selected: org.apache.spark.sql.DataFrame = [name: string]
scala> selected.take(10)
res57: Array[org.apache.spark.sql.Row] = Array([English], [Math], [Math], [Science])
scala> subjectDf.registerTempTable("subjecttable")
scala> val mathDf = sqlContext.sql("select * from subjecttable where name='Math'")
mathDf: org.apache.spark.sql.DataFrame = [name: string, mark: string]
scala> mathDf.take(10)
res59: Array[org.apache.spark.sql.Row] = Array([Math,80], [Math,75])
No comments:
Post a Comment