本文介绍了在 Spark 中使用 OSS Select 来优化数据查询的方法。目前 OSS Select 处于公测中。
进入 jars 目录,下载 oss-select.jar 依赖包。
下载地址:
项目源码可参考开源项目minio-SparkSelect
// Example: read a CSV object through the select-based Spark data source,
// first with the DataFrame API and then with Spark SQL.
// Assumes a spark-shell style session where `spark` (a SparkSession) is already
// defined and the schema types are in scope, e.g. via
// `import org.apache.spark.sql.types._`.

// S3A connector settings. The empty strings are placeholders to be filled in
// with the real access key, secret key and endpoint.
val hadoopConf = spark.sparkContext.hadoopConfiguration
hadoopConf.set("fs.s3a.access.key", "")
hadoopConf.set("fs.s3a.secret.key", "")
hadoopConf.set("fs.s3a.endpoint", "")
hadoopConf.set("fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")

// Explicit schema for the object; both columns are declared non-nullable.
val schema = StructType(
  List(
    StructField("name", StringType, nullable = false),
    StructField("age", IntegerType, nullable = false)
  )
)

// DataFrame API. The original snippet lists "minioSelectJSON" as the
// alternative format name (presumably for JSON objects).
val df = spark.read
  .format("minioSelectCSV")
  .schema(schema)
  .options(Map("quote" -> "\'", "header" -> "true", "delimiter" -> ","))
  .load("s3://bucket/object") // path typo fixed: was "s3://bukcet/object"

df.select("*").show()

// SQL API: expose the object as a temporary view, then filter on it.
spark.sql(
  "CREATE TEMPORARY VIEW MyView (age INT, name STRING) USING minioSelectCSV OPTIONS (path \"s3://bucket/object\")"
)
spark.sql("select * from MyView where age > 10").show()