@@ -0,0 +1,248 @@
+package com.aspirecn.jiaoke.data.extract
+
+import java.nio.file.{Files, Paths}
+
+import org.apache.log4j.{Level, Logger}
+import org.apache.spark.sql.{DataFrame, Dataset, Row, SparkSession}
+import org.apache.spark.storage.StorageLevel
+
+/**
+ * @author: hwj
+ * @date: 2021/4/13 10:39
+ * @description: read gantry, ETC-exit and cash-exit toll CSV data, aggregate per-vehicle daily travel statistics and write the result as CSV files
+ */
+object DataExtractByReadFile {
+
+  // exit-record fields; exTime/enTime are kept as String because the CSVs are read without a schema and the timestamps are parsed in SQL
+  case class TrcExetcpu(vehicleid: String, exTime: String, enTime: String, feemileage: String, shortfeemileage: String, fee: String, enweight: String, extolllaneid: String, axlecount: String, vehicletype: String, day: String)
+
+  // gantry transaction record
+  case class GBUPLoad(vehicleid: String, transtime: String, day: String)
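+  // expected input layout: the gantry CSV provides (vehicleid, transtime, day); the ETC and cash
+  // exit CSVs provide the TrcExetcpu fields in declaration order (see the toDF calls in main)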
+
+  def main(args: Array[String]): Unit = {
+    val log: Logger = Logger.getLogger("DataExtractByReadFile")
+    log.setLevel(Level.INFO)
+    if (args.length != 6) {
+      log.error(
+        """
+          |Please pass the program arguments correctly: 1. gantry data input directory
+          | 2. ETC data input directory
+          | 3. cash exit data input directory
+          | 4. result output path (the directory must not exist before the run; the program creates it, otherwise the job fails)
+          | 5. checkpoint directory
+          | 6. number of result files - choose it according to the data volume; the fewer the files, the longer the run; 0 keeps the default
+          |""".stripMargin)
+      System.exit(0)
+    }
+
+    // gantry data input directory
+    val menjiaPath: String = args(0)
+    // ETC data input directory
+    val etcPath: String = args(1)
+    // cash exit data input directory
+    val qitachukouPath: String = args(2)
+    // result output directory
+    val out_path: String = args(3)
+    // checkpoint directory
+    val check_path: String = args(4)
+    // number of result files
+    val result_partitions: Int = Integer.valueOf(args(5))
+
+    // stop the program when an input directory does not exist
+    if ((!exists(menjiaPath)) || (!exists(etcPath)) || (!exists(qitachukouPath))) {
+      log.error("Please supply valid input directories. " +
+        "gantry data path: " + menjiaPath +
+        ", ETC data path: " + etcPath +
+        ", cash exit data path: " + qitachukouPath)
+      System.exit(0)
+    }
+    // stop the program when the output directory already exists
+    if (exists(out_path)) {
+      log.error("The output directory already exists: " + out_path)
+      System.exit(0)
+    }
+    // initialise the SparkSession
+    val spark: SparkSession = formatSparkSession
+    // set the checkpoint directory
+    spark.sparkContext.setCheckpointDir(check_path)
+    // spark.sparkContext.setCheckpointDir("file:///data/checkpoint")
+
+    log.info("Starting data extraction")
+    // todo: start processing the data
+    import org.apache.spark.sql.functions._
+    // todo: import the implicit conversions
+    import spark.implicits._
+    // gantry data
+    log.info("Extracting gantry data from: " + menjiaPath)
+    val gbuLoadDataset: Dataset[GBUPLoad] = spark.read.option("header", true).csv(menjiaPath).toDF("vehicleid", "transtime", "day").as[GBUPLoad].coalesce(36)
+    log.info("Gantry data extracted")
+    // ETC exit data
+    log.info("Extracting ETC exit data from: " + etcPath)
+    val trcExetcpuDataset = spark.read.option("header", true).csv(etcPath).toDF("vehicleid", "extime", "entime", "feemileage", "shortfeemileage", "fee", "enweight", "extolllaneid", "axlecount", "vehicletype", "day").as[TrcExetcpu].coalesce(36)
+    log.info("ETC exit data extracted")
+    // other (cash) exit data
+    log.info("Extracting cash exit data from: " + qitachukouPath)
+    val trcOtuDataset = spark.read.option("header", true).csv(qitachukouPath).toDF("vehicleid", "extime", "entime", "feemileage", "shortfeemileage", "fee", "enweight", "extolllaneid", "axlecount", "vehicletype", "day").as[TrcExetcpu].coalesce(36)
+    log.info("Cash exit data extracted")
+
+    // merge the ETC exit and cash exit data
+    log.info("Merging ETC exit and cash exit data")
+    val trcDataSet: Dataset[TrcExetcpu] = trcExetcpuDataset.unionByName(trcOtuDataset)
+    log.info("ETC exit and cash exit data merged")
+    // register a view and derive the per-trip exit metrics
+    trcDataSet.createOrReplaceTempView("trcDataSet")
+    log.info("View created, computing exit metrics")
+    val trcDataSet_p: DataFrame = spark.sql(
+      """
+        |select
+        |vehicleid,
+        |exTime,
+        |entime,
+        |(case when entime is not null then unix_timestamp(regexp_replace(extime,'T',' '))-unix_timestamp(regexp_replace(entime,'T',' ')) else 0 end) as travel_time,
+        |(case when feemileage>0 and feemileage is not null then feemileage else shortfeemileage end) as feemileage,
+        |cast(fee as double) as fee,
+        |(case when enweight>0 then (case when feemileage>0 and feemileage is not null then feemileage else shortfeemileage end) else 0 end) as weight_mileage,
+        |substr(exTollLaneId,6,2) as travel_provinces,
+        |axlecount,
+        |vehicletype,
+        |day
+        |from
+        |trcDataSet
+        |""".stripMargin)
+    log.info("Exit metrics computed")
+    // take the exit and entry transaction times from the exit data so they can be merged with the gantry records
+    log.info("Extracting exit transaction times from the exit data")
+    val ex_transtime: Dataset[GBUPLoad] = trcDataSet_p.select($"day", $"vehicleid".as("vehicleid"), $"exTime".as("transtime")).as[GBUPLoad]
+    log.info("Exit transaction times extracted")
+    log.info("Extracting entry transaction times from the exit data")
+    val en_transtime: Dataset[GBUPLoad] = trcDataSet_p.select($"day", $"vehicleid".as("vehicleid"), $"enTime".as("transtime")).as[GBUPLoad]
+    log.info("Entry transaction times extracted")
+    log.info("Merging gantry transaction times with the entry and exit transaction times")
+    val exetcpuDataset: Dataset[GBUPLoad] = gbuLoadDataset.unionByName(ex_transtime).unionByName(en_transtime)
+    log.info("Transaction times merged")
+    // persist and checkpoint the exit metrics, then release the cache
+    trcDataSet_p.persist(StorageLevel.MEMORY_AND_DISK)
+    trcDataSet_p.checkpoint()
+    trcDataSet_p.unpersist()
+    // register a view and bucket each transaction by hour of day into slots A-H ('I' when the time cannot be parsed)
+    exetcpuDataset.createOrReplaceTempView("gbupload_etctu")
+    val gbupload_etctu: Dataset[GBUPLoad] = spark.sql(
+      """
+        |select vehicleid as vehicleid,
+        |(case
+        | when 0 <= hour(transtime) and hour(transtime)<=3 then 'A'
+        | when 3 < hour(transtime) and hour(transtime)<=6 then 'B'
+        | when 6 < hour(transtime) and hour(transtime)<=9 then 'C'
+        | when 9 < hour(transtime) and hour(transtime)<=12 then 'D'
+        | when 12 < hour(transtime) and hour(transtime)<=15 then 'E'
+        | when 15 < hour(transtime) and hour(transtime)<=18 then 'F'
+        | when 18 < hour(transtime) and hour(transtime)<=21 then 'G'
+        | when 21 < hour(transtime) and hour(transtime)<24 then 'H'
+        | else 'I' end) as transtime,
+        |day
+        |from
+        |gbupload_etctu
+        |""".stripMargin).as[GBUPLoad]
+    gbupload_etctu.persist(StorageLevel.MEMORY_AND_DISK)
+    gbupload_etctu.checkpoint()
+    gbupload_etctu.unpersist()
+    spark.sqlContext.dropTempTable("gbupload_etctu")
+
+    // todo: group by day and licence plate: latest exit time, total and longest travel time, total mileage,
+    //       total fee, total loaded mileage and number of trips
+    val trcDataFrame = trcDataSet_p
+      .groupBy($"day", $"vehicleid")
+      .agg(
+        max($"exTime").as("max_exTime"),
+        sum($"travel_time").as("sum_travel_time"),
+        max($"travel_time").as("max_travel_time"),
+        sum($"feemileage").as("sum_feemileage"),
+        sum($"fee").as("sum_fee"),
+        sum($"weight_mileage").as("sum_weight_mileage"),
+        count($"vehicleid").as("exTimes_count"))
+    // todo: take each vehicle's axle count and vehicle type from its latest exit record
+    trcDataSet_p.createOrReplaceTempView("trc_table")
+    val vehicleTypeAxlecount: DataFrame = spark.sql(
+      """
+        |select
+        |a.day as day,
+        |a.vehicleid as vehicleid,
+        |a.axlecount as axlecount,
+        |a.vehicletype as vehicletype
+        |from
+        |(select day,
+        |vehicleid,
+        |axlecount,
+        |vehicletype,
+        |row_number() over(partition by day, vehicleid order by exTime desc) as rr
+        |from
+        |trc_table) a where a.rr=1
+        |""".stripMargin)
+    // val trc_C_V_DataFrame: DataFrame = trcDataFrame.join(vehicleTypeAxlecount.select($"axlecount",$"vehicletype",$"day",$"vehicleid"), (trcDataFrame.col("vehicleid") === vehicleTypeAxlecount.col("vehicleid")).and(trcDataFrame.col("day") === vehicleTypeAxlecount.col("day")), "left")
+    // join the exit statistics with the vehicle axle count and type
+    val trc_C_V_DataFrame: DataFrame = trcDataFrame.join(vehicleTypeAxlecount, Seq("day", "vehicleid"), "left")
+    // todo: count, per day and vehicle, how many exit records fall in each province code
+    val trcProvinceDataSet: Dataset[Row] = trcDataSet_p.select($"day", $"vehicleid", $"travel_provinces").groupBy($"day", $"vehicleid", $"travel_provinces").count().withColumnRenamed("count", "travel_provinces_count")
+    trcProvinceDataSet.createOrReplaceTempView("trcProvince_table")
+    // collapse the per-province counts into one "province%count#province%count" string per vehicle and day
+    val trcProvinceCountDataSet = spark.sql(
+      """
+        |select day, vehicleid as vehicleid, concat_ws("#",collect_list(concat_ws("%",travel_provinces,travel_provinces_count))) as travel_provinces_count
+        |from trcProvince_table
+        |group by day, vehicleid
+        |""".stripMargin)
+    val trcTransProvinceCountDataFrame = trc_C_V_DataFrame.join(trcProvinceCountDataSet,
+      Seq("day", "vehicleid"),
+      "outer")
+
+    // count, for each licence plate, the number of trips in each time slot
+    val transtimeDataFrame: DataFrame = gbupload_etctu.groupBy($"day", $"vehicleid", $"transtime")
+      .agg(count("transtime").as("transtime_preferences_count"))
+    // collapse the per-slot counts into one "slot%count#slot%count" string per vehicle and day
+    transtimeDataFrame.createOrReplaceTempView("transtimeData")
+    val transTimeCountMapDataFrame: DataFrame = spark.sql(
+      """
+        |select day,
+        |vehicleid,
+        |concat_ws("#",collect_list(concat_ws("%",transtime,transtime_preferences_count))) as transtime_count
+        |from transtimeData
+        |group by day, vehicleid
+        |""".stripMargin)
+    var trcResultDataFrame = trcTransProvinceCountDataFrame.join(transTimeCountMapDataFrame,
+      Seq("day", "vehicleid"),
+      "outer")
+    // the temporary views are no longer needed
+    spark.sqlContext.dropTempTable("trcDataSet")
+    spark.sqlContext.dropTempTable("trcProvince_table")
+    spark.sqlContext.dropTempTable("transtimeData")
+    spark.sqlContext.dropTempTable("trc_table")
+    log.info(trcResultDataFrame.schema.treeString)
+    // repartition the output into the requested number of files (0 keeps Spark's default partitioning)
+    if (result_partitions > 0) {
+      trcResultDataFrame = trcResultDataFrame.repartition(result_partitions)
+    }
+    trcResultDataFrame.write.csv(out_path)
+
+    spark.stop()
+  }
+
+  // builds the SparkSession; application name and master are left to the submit environment
+  def formatSparkSession(): SparkSession = {
+    val spark: SparkSession = SparkSession
+      .builder()
+      // .appName("DataExtract")
+      // .master("local[*]")
+      .getOrCreate()
+    spark
+  }
+
+  // returns true when the given path exists on the local file system
+  def exists(path: String): Boolean = {
+    Files.exists(Paths.get(path))
+  }
+
+
+}