/*
 * Copyright Alibaba Group Holding Ltd.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.aliyun.lindorm.ldspark.examples;

import com.aliyun.oss.OSS;
import com.aliyun.oss.OSSClientBuilder;
import com.aliyun.oss.model.GetObjectRequest;
import com.aliyun.oss.model.ListObjectsRequest;
import com.aliyun.oss.model.OSSObjectSummary;
import com.aliyun.oss.model.ObjectListing;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.FlatMapFunction;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.RowFactory;
import org.apache.spark.sql.SparkSession;
import org.apache.spark.sql.catalyst.analysis.NoSuchTableException;
import org.apache.spark.sql.types.DataTypes;
import org.apache.spark.sql.types.StructField;
import org.apache.spark.sql.types.StructType;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;

/**
 * This example demonstrates how to read files from OSS and write their content into a Lindorm table:
 * 1. Set up an OSS client;
 * 2. Set up a SparkContext;
 * 3. List all file paths with the OSS client;
 * 4. Distribute the OSS file paths across a number of Spark partitions through the SparkContext;
 * 5. Each Spark partition downloads its share of the OSS files in a Spark task;
 * 6. After downloading and parsing, the OSS data set is mapped to an RDD;
 * 7. The RDD is converted to a Spark DataFrame, which you can manipulate like a common SQL table.
 */
public class DownloadOSSDataAndWriteToLindorm {
    /**
     * Lists the objects under an OSS prefix, downloads and parses them in parallel with Spark,
     * and appends the resulting rows to a Lindorm table.
     *
     * @param args command-line arguments (unused)
     * @throws NoSuchTableException if the target table does not exist when appending
     */
    public static void main(String[] args) throws NoSuchTableException {

        // When using OSS SDK, please refer to
        // https://help.aliyun.com/zh/oss/developer-reference/download-objects-as-files-6
        // Define the OSS bucket name the data files belong to
        String ossBucketName = "examplebucket";
        // Define the OSS directory (key prefix) the data files belong to
        String ossDir = "dir/in/bucket";
        // Define the local directory that the data files will be downloaded to
        String localDir = "/local/dir/for/download"; // e.g. /opt/spark/work-dir/
        // Define the table in the lindorm_table catalog to write to, format "catalog.db.table"
        String targetTable = "lindorm_table.your_db.your_table";

        // When submitting spark job, please refer to
        // https://help.aliyun.com/document_detail/354202.html
        // to configure the following parameters:
        // * spark.hadoop.fs.oss.endpoint
        // * spark.hadoop.fs.oss.accessKeyId
        // * spark.hadoop.fs.oss.accessKeySecret
        // * spark.sql.catalog.lindorm_table.username
        // * spark.sql.catalog.lindorm_table.password

        // Initialize SparkSession & JavaSparkContext
        SparkSession spark = SparkSession.builder()
                .appName("ReadFileFromOSS")
                .getOrCreate();

        String accessKeyId = spark.conf().get("spark.hadoop.fs.oss.accessKeyId");
        String accessKeySecret = spark.conf().get("spark.hadoop.fs.oss.accessKeySecret");
        String endpoint = spark.conf().get("spark.hadoop.fs.oss.endpoint");

        try (JavaSparkContext jsc = new JavaSparkContext(spark.sparkContext())) {
            // Retrieve all file paths in ossDir through an OSS client on the driver
            List<String> filePaths;
            OSS ossClient = new OSSClientBuilder().build(endpoint, accessKeyId, accessKeySecret);
            try {
                filePaths = listObjectKeys(ossClient, ossBucketName, ossDir);
            } finally {
                ossClient.shutdown();
            }

            // Download the OSS files to local disk in parallel through Spark, then load and
            // convert their content into rows
            JavaRDD<Row> rowRDD =
                    jsc.parallelize(filePaths).mapPartitions(new FlatMapFunction<Iterator<String>, Row>() {
                        @Override
                        public Iterator<Row> call(Iterator<String> partition) throws Exception {
                            // The OSS client is created inside the task, one per partition
                            OSS taskOssClient =
                                    new OSSClientBuilder().build(endpoint, accessKeyId, accessKeySecret);
                            List<Row> rows = new ArrayList<>();
                            try {
                                while (partition.hasNext()) {
                                    String filePath = partition.next();
                                    // Flatten the object key into a single file name and resolve it
                                    // against localDir, so it works with or without a trailing '/'
                                    File localFile = new File(localDir, filePath.replace("/", "_"));
                                    // Download the OSS file to local
                                    taskOssClient.getObject(
                                            new GetObjectRequest(ossBucketName, filePath), localFile);
                                    // Load the file and convert it into rows
                                    parseFile(localFile, rows);
                                }
                            } finally {
                                taskOssClient.shutdown();
                            }
                            return rows.iterator();
                        }
                    });

            // Define the schema of Row
            StructType schema = DataTypes.createStructType(new StructField[] {
                    DataTypes.createStructField("id", DataTypes.IntegerType, false),
                    DataTypes.createStructField("name", DataTypes.StringType, false)
            });
            // Transform RDD<Row> to DataFrame and write to the lindorm_table catalog
            spark.createDataFrame(rowRDD, schema).writeTo(targetTable).append();
        }
    }

    /**
     * Collects the keys of all objects whose key starts with {@code prefix}, following the
     * listing's pagination markers until the listing is no longer truncated.
     *
     * @param ossClient  OSS client used for listing; the caller remains responsible for shutdown
     * @param bucketName bucket to list
     * @param prefix     key prefix ("directory") to list
     * @return keys of all matching objects, excluding keys that end with {@code '/'}
     */
    private static List<String> listObjectKeys(OSS ossClient, String bucketName, String prefix) {
        List<String> keys = new ArrayList<>();
        String marker = null;
        ObjectListing listing;
        do {
            ListObjectsRequest request = new ListObjectsRequest(bucketName);
            request.setPrefix(prefix);
            request.setMarker(marker);
            listing = ossClient.listObjects(request);
            for (OSSObjectSummary summary : listing.getObjectSummaries()) {
                String key = summary.getKey();
                // Keys ending with '/' are treated as directory placeholders without data
                if (!key.endsWith("/")) {
                    keys.add(key);
                }
            }
            marker = listing.getNextMarker();
        } while (listing.isTruncated());
        return keys;
    }

    /**
     * Parses one downloaded file and appends the resulting rows to {@code rows}.
     *
     * <p>!!! NOTICE !!! You need to write your own file parsing logic here and convert to Row.
     * This sample expects each line to be {@code "<int id> <name>"} separated by a single space.
     *
     * @param localFile downloaded file to read, decoded as UTF-8
     * @param rows      destination list the parsed rows are appended to
     * @throws IOException if the file cannot be opened or read
     */
    private static void parseFile(File localFile, List<Row> rows) throws IOException {
        // try-with-resources guarantees the reader is closed even when parsing fails
        try (BufferedReader reader = new BufferedReader(
                new InputStreamReader(new FileInputStream(localFile), StandardCharsets.UTF_8))) {
            String line;
            while ((line = reader.readLine()) != null) {
                String[] parts = line.split(" ");
                rows.add(RowFactory.create(Integer.valueOf(parts[0]), parts[1]));
            }
        }
    }
}
