/*
 * Copyright Alibaba Group Holding Ltd.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.aliyun.lindorm.ldspark.examples;

import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;

/**
 * This program demonstrates how to load data from MongoDB using ldspark.
 * We need to go through the following steps:
 *
 * ============= Step1: Insert data into MongoDB:
 * ```
 * sh.enableSharding("spark")
 * sh.shardCollection("spark.test", {_id: 'hashed'})
 * for (var i = 0; i < 4000; i++) {
 *   if ( i % 20 == 0 ) {
 *     db.test.insert({i: i, name: "name" + i, age: i / 20 });
 *   } else if ( i % 30 == 0 ) {
 *     db.test.insert({i: i, name: "name" + i, addr: "addr" + i, age: i / 20 });
 *   } else {
 *     db.test.insert({i: i, name: "name" + i});
 *   }
 * }
 * ```
 *
 * ============= Step2: Run this example at Lindorm console.
 * {
 *   "token": "TOKEN",
 *   "appName": "APP_NAME",
 *   "username": "USERNAME",
 *   "password": "PWD",
 *   "mainResource": "hdfs:///path/to/your/jar",
 *   "mainClass": "com.aliyun.lindorm.ldspark.examples.ReadMongoDBTables",
 *   "configs": {
 *     "spark.jars": "hdfs:///ldps-user-resource/ldps-bson-5.1.4.jar,hdfs:///ldps-user-resource/ldps-bson-record-codec-5.1.4.jar,hdfs:///ldps-user-resource/ldps-mongo-spark-connector_2.12-10.4.0.jar,hdfs:///ldps-user-resource/ldps-mongodb-driver-core-5.1.4.jar,hdfs:///ldps-user-resource/ldps-mongodb-driver-sync-5.1.4.jar"
 *   },
 *   "args": ["mongodb://root:XXXX@s-xxx1.mongodb.rds.aliyuncs.com:3717,s-xxx2.mongodb.rds.aliyuncs.com:3717/admin"]
 * }
 *
 * This program depends on 2 dependencies:
 *   1) org.apache.spark:spark-sql_2.12:3.3.1:provided
 *   2) org.mongodb.spark:mongo-spark-connector_2.12:10.4.0
 * The parameters in the config json should be obtained as follows:
 * 1. Param - mainResource: 1) Run `mvn clean package`; 2) upload the jar to hdfs.
 * 2. Param - spark.jars: 1) Run `mvn dependency:copy-dependencies -DoutputDirectory=dependencies -DexcludeScope=provided` ; 2) upload the jars to hdfs.
 * 3. Param - args: The mongodb connection string.
 * <p>
 * ============= Related documents
 * 1. MongoDB Spark Connector: https://www.mongodb.com/zh-cn/docs/spark-connector/current/batch-mode/batch-read/
 */
public class ReadMongoDBTables {
  /**
   * Reads the {@code spark.test} MongoDB collection into a DataFrame, prints its schema and
   * some rows, then copies the rows with {@code i > 50} into the table
   * {@code lindorm_columnar.mongodb.tbl} and prints that table's description and contents.
   *
   * @param args {@code args[0]} must be the MongoDB connection string; it is used for both
   *             the connector's read and write connection URIs
   * @throws IllegalArgumentException if the connection string argument is missing or blank
   */
  public static void main(String[] args) {
    // Fail fast with a usage hint instead of a bare ArrayIndexOutOfBoundsException.
    if (args == null || args.length < 1 || args[0] == null || args[0].trim().isEmpty()) {
      throw new IllegalArgumentException(
          "Missing argument. Usage: ReadMongoDBTables <mongodb-connection-string>");
    }
    String addr = args[0];

    // 1. Build SparkSession and specify the MongoDB connection string.
    SparkSession spark = SparkSession.builder()
                                     .config("spark.mongodb.read.connection.uri", addr)
                                     .config("spark.mongodb.write.connection.uri", addr)
                                     .getOrCreate();
    try {
      // 2. Read data from MongoDB.
      Dataset<Row> dataFrame = spark.read()
                                    .format("mongodb")
                                    .option("database", "spark")
                                    .option("collection", "test")
                                    .load();
      /*
       * Expected output (with the sample data from the class Javadoc):
       * root
       *  |-- _id: string (nullable = true)
       *  |-- addr: string (nullable = true)
       *  |-- age: double (nullable = true)
       *  |-- i: integer (nullable = true)
       *  |-- name: string (nullable = true)
       */
      dataFrame.printSchema();

      // 3. Show data.
      dataFrame.show(100);

      // 4. Filter data.
      dataFrame.filter("i < 50").show(50);
      long count = dataFrame.filter("i > 50").count();
      /*
       * Expected output:
       * Count: 3949
       */
      System.out.printf("Count: %d%n", count);

      // 5. Write data (loaded from MongoDB) into Lindorm columnar.
      // The DataFrame is exposed as a temp view so it can be referenced from SQL.
      dataFrame.createOrReplaceTempView("mongodb_view");
      spark.sql("CREATE DATABASE IF NOT EXISTS lindorm_columnar.mongodb");
      // Drop first so that repeated runs start from an empty table.
      spark.sql("DROP TABLE IF EXISTS lindorm_columnar.mongodb.tbl");
      // NOTE: `age` is declared as int here while the MongoDB schema above reports double;
      // the sample output below shows the value stored as an integer (e.g. 4 for i = 90).
      spark.sql("CREATE TABLE IF NOT EXISTS lindorm_columnar.mongodb.tbl(" +
                  "id int," +
                  "name string," +
                  "addr string," +
                  "age int) " +
                  "PARTITIONED BY (bucket(4, id))");
      spark.sql("INSERT INTO lindorm_columnar.mongodb.tbl " +
                  "SELECT i, name, addr, age FROM mongodb_view WHERE i > 50");
      spark.sql("DESCRIBE EXTENDED lindorm_columnar.mongodb.tbl;").show(false);
      /*
       * Expected output:
       * +--------+
       * |count(1)|
       * +--------+
       * |3949    |
       * +--------+
       */
      spark.sql("SELECT count(*) FROM lindorm_columnar.mongodb.tbl ").show(false);
      /*
       * Expected output (excerpt):
       * +---+-------+------+----+
       * |id |name   |addr  |age |
       * +---+-------+------+----+
       * |71 |name71 |null  |null|
       * |81 |name81 |null  |null|
       * |90 |name90 |addr90|4   |
       * |97 |name97 |null  |null|
       * |106|name106|null  |null|
       * |110|name110|null  |null|
       * +---+-------+------+----+
       */
      spark.sql("SELECT * FROM lindorm_columnar.mongodb.tbl ").show(false);
    } finally {
      // Always release the session, even if one of the steps above fails.
      spark.stop();
    }
  }
}
