diff --git a/.github/workflows/java-ci.yaml b/.github/workflows/java-ci.yaml
index eab1b210..1f603c0b 100644
--- a/.github/workflows/java-ci.yaml
+++ b/.github/workflows/java-ci.yaml
@@ -53,14 +53,6 @@ jobs:
         with:
           submodules: recursive

-      - name: Cache for ccache
-        uses: actions/cache@v3
-        with:
-          path: ~/.ccache
-          key: ${{ runner.os }}-${{ matrix.metadata }}-ccache-${{ hashFiles('**/git-modules.txt') }}
-          restore-keys: |
-            ${{ runner.os }}-${{ matrix.metadata }}-ccache-
-
      - name: Install Dependencies for Linux
        if: runner.os == 'Linux'
        run: |
@@ -173,12 +165,73 @@ jobs:
           # wait for hive docker ready
           sleep 60

+      - name: Hive test
+        run: |
+          pushd java/hive/test
+          ./test.sh
+          popd
+
+      - name: Spark with hive test
+        run: |
+          pushd java/hive/test
+          ./spark-hive-test.sh
+          popd
+
+      - name: Stop hive docker
+        run: |
+          pushd java/hive/docker
+          docker-compose -f docker-compose.yaml stop
+          docker-compose -f docker-compose.yaml rm -f
+          popd
+
+      - name: Start mysql container
+        run: |
+          pushd java/hive/docker/dependency/mysql
+          docker-compose -f ./mysql-compose.yaml up -d
+          popd
+
+      - name: Start vineyard servers for hive distributed test
+        run: |
+          ./build/bin/vineyardd --socket=./build/vineyard_sock/metastore/vineyard.sock -rpc_socket_port=18880 --etcd_endpoint="0.0.0.0:2383" &
+          ./build/bin/vineyardd --socket=./build/vineyard_sock/hiveserver/vineyard.sock -rpc_socket_port=18881 --etcd_endpoint="0.0.0.0:2383" &
+          ./build/bin/vineyardd --socket=./build/vineyard_sock/0/vineyard.sock -rpc_socket_port=18882 --etcd_endpoint="0.0.0.0:2383" &
+          ./build/bin/vineyardd --socket=./build/vineyard_sock/1/vineyard.sock -rpc_socket_port=18883 --etcd_endpoint="0.0.0.0:2383" &
+          ./build/bin/vineyardd --socket=./build/vineyard_sock/2/vineyard.sock -rpc_socket_port=18884 --etcd_endpoint="0.0.0.0:2383" &
+
+      - name: Start hadoop cluster
+        run: |
+          pushd java/hive/docker
+          docker-compose -f docker-compose-distributed.yaml up -d
+          popd
+
+          # wait for hive docker ready
+          sleep 60
+
+      - name: Hive distributed test
+        run: |
+          pushd java/hive/test
+          ./distributed-test.sh
+          popd
+
+      - name: Spark with hive distributed test
+        run: |
+          pushd java/hive/test
+          ./spark-hive-distributed-test.sh
+          popd
+
       - name: Setup tmate session
         if: false
         uses: mxschmitt/action-tmate@v3

-      - name: Hive test
+      - name: Stop containers
         run: |
-          pushd java/hive/test
-          ./test.sh
+          pushd java/hive/docker
+          docker-compose -f docker-compose-distributed.yaml stop
+          docker-compose -f docker-compose-distributed.yaml rm -f
           popd
+
+          pushd java/hive/docker/dependency/mysql
+          docker-compose -f ./mysql-compose.yaml stop
+          docker-compose -f ./mysql-compose.yaml rm -f
+          popd
+
diff --git a/.gitignore b/.gitignore
index 50462dfe..7b1fb812 100644
--- a/.gitignore
+++ b/.gitignore
@@ -27,8 +27,8 @@ cmake-build-debug
 *.whl

 # hive sql work directory
-/java/hive/distributed/docker/mysql/conf/
-/java/hive/distributed/docker/mysql/data/
+/java/hive/docker/dependency/mysql/conf/
+/java/hive/docker/dependency/mysql/data/

 # coredump
 core.*.*
diff --git a/java/hive/README.rst b/java/hive/README.rst
index d0053f7e..7381a002 100644
--- a/java/hive/README.rst
+++ b/java/hive/README.rst
@@ -306,7 +306,7 @@ Build Hive Docker Image with Hadoop
 ### Build docker images
 ```bash
-    cd v6d/java/hive/distributed
+    cd v6d/java/hive/docker
     ./build.sh
 ```
@@ -322,25 +322,6 @@ Build Hive Docker Image with Hadoop
     # You can change the password in mysql-compose.yaml and hive-site.xml
 ```

-### Run hadoop & hive docker images
-```bash
-    cd v6d/java/hive/docker
-    docker-compose -f docker-compose-distributed.yaml up -d
-```
-
-### Create table
-```bash
-    docker exec -it hive-hiveserver2 beeline -u "jdbc:hive2://hive-hiveserver2:10000" -n root
-```
-
-```sql
-    -- in beeline
-    drop table test_hive1;
-    create table test_hive1(field int);
-    insert into table test_hive1 values (1),(2),(3),(4),(5),(6),(7),(8),(9),(10);
-    select * from test_hive1;
-```
-
 Using vineyard as storage
 -----------------
@@ -366,23 +347,27 @@ Using vineyard as storage
 ### Copy vineyard jars to share dir
 ```bash
-    mkdir -p ~/share
+    mkdir -p v6d/share
     cd v6d/java/hive
     # you can change share dir in docker-compose.yaml
-    cp target/vineyard-hive-0.1-SNAPSHOT.jar ~/share
+    cp target/vineyard-hive-0.1-SNAPSHOT.jar ../../share
 ```

-### Create table with vineyard
+### Run hadoop & hive docker images
+```bash
+    cd v6d/java/hive/docker
+    docker-compose -f docker-compose-distributed.yaml up -d
+```
+
+### Create table
 ```bash
     docker exec -it hive-hiveserver2 beeline -u "jdbc:hive2://hive-hiveserver2:10000" -n root
 ```

 ```sql
     -- in beeline
-    drop table test_vineyard;
-    create table test_vineyard(field int)
-            stored as Vineyard
-            location "vineyard:///user/hive_remote/warehouse/test_vineyard";
-    insert into table test_vineyard values (1),(2),(3),(4),(5),(6),(7),(8),(9),(10);
-    select * from test_vineyard;
+    drop table test_hive;
+    create table test_hive(field int);
+    insert into table test_hive values (1),(2),(3),(4),(5),(6),(7),(8),(9),(10);
+    select * from test_hive;
 ```
diff --git a/java/hive/docker/README.rst b/java/hive/docker/README.rst
deleted file mode 100644
index 1e26814e..00000000
--- a/java/hive/docker/README.rst
+++ /dev/null
@@ -1,98 +0,0 @@
-Build Hive Docker Image with Hadoop
------------------
-
-### Prepare vineyard jars
-```bash
-    # Currently, the vineyard jar cannot run directly on hive because of
-    # dependency conflicts. You can run it temporarily by reverting to an
-    # older version of guava (such as 14.0.1) dependent by vineyard.
-    # This problem will be fixed in the future.
-    mvn clean package
-```
-
-### Build docker images
-```bash
-    cd v6d/java/hive/docker
-    ./build.sh
-```
-
-### Create network
-```bash
-    docker network create hadoop-network
-```
-
-### Start sql server for hive metastore
-```bash
-    cd v6d/java/hive/docker/dependency/mysql
-    docker-compose -f mysql-compose.yaml up -d
-    # You can change the password in mysql-compose.yaml and hive-site.xml
-```
-
-### Run hadoop & hive docker images
-```bash
-    cd v6d/java/hive/docker
-    docker-compose -f docker-compose-distributed.yaml up -d
-```
-
-### Prepare tez jars
-```bash
-    docker exet -it hive-metastore bash
-    # in docker
-    hdfs dfs -put /tez.tar.gz /
-    exit
-```
-
-### Restart all services
-```bash
-    cd v6d/java/hive/distributed/docker
-    docker-compose -f docker-compose.yaml restart
-```
-
-### Create table
-```bash
-    docker exec -it hive-hiveserver2 beeline -u "jdbc:hive2://hive-hiveserver2:10000" -n root
-```
-
-Using vineyard as storage
------------------
-
-### Run vineyardd
-```bash
-    cd v6d/build
-
-    # at terminal 1
-    ./bin/vineyardd --socket=~/vineyard_sock/0/vineyard.sock -rpc_socket_port=9601 --etcd_endpoint="0.0.0.0:2382"
-
-    # at terminal 2
-    ./bin/vineyardd --socket=~/vineyard_sock/1/vineyard.sock -rpc_socket_port=9602 --etcd_endpoint="0.0.0.0:2382"
-
-    # at terminal 3
-    ./bin/vineyardd --socket=~/vineyard_sock/2/vineyard.sock -rpc_socket_port=9603 --etcd_endpoint="0.0.0.0:2382"
-
-    # at terminal 4
-    ./bin/vineyardd --socket=~/vineyard_sock/metastore/vineyard.sock -rpc_socket_port=9604 --etcd_endpoint="0.0.0.0:2382"
-
-    # at terminal 5
-    ./bin/vineyardd --socket=~/vineyard_sock/hiveserver/vineyard.sock -rpc_socket_port=9605 --etcd_endpoint="0.0.0.0:2382"
-```
-
-### Copy vineyard jars to share dir
-```bash
-    mkdir -p ~/share
-    cd v6d/java/hive
-    # you can change share dir in docker-compose.yaml
-    cp target/vineyard-hive-0.1-SNAPSHOT.jar ~/share
-```
-
-### Create table with vineyard
-```bash
-    docker exec -it hive-hiveserver2 beeline -u "jdbc:hive2://hive-hiveserver2:10000" -n root
-```
-
-```sql
-    -- in beeline
-    drop table test_vineyard;
-    create table test_vineyard(field int);
-    insert into table test_vineyard values (1),(2),(3),(4),(5),(6),(7),(8),(9),(10);
-    select * from test_vineyard;
-```
\ No newline at end of file
diff --git a/java/hive/docker/build.sh b/java/hive/docker/build.sh
index 10e40b8c..5265c2f6 100755
--- a/java/hive/docker/build.sh
+++ b/java/hive/docker/build.sh
@@ -1,10 +1,11 @@
 WORK_DIR=~/hive-workdir
 mkdir -p "$WORK_DIR"
-find ~/hive-workdir -maxdepth 1 -mindepth 1 ! -name '*.tar.gz' -exec rm -rf {} \;
+find ~/hive-workdir -maxdepth 1 -mindepth 1 ! \( -name '*.tar.gz' -o -name '*.tgz' \) -exec rm -rf {} \;

 TEZ_VERSION=${TEZ_VERSION:-"0.9.1"}
 HIVE_VERSION=${HIVE_VERSION:-"2.3.9"}
+SPARK_VERSION=${SPARK_VERSION:-"3.4.1"}

 if [ -f "$WORK_DIR/apache-tez-$TEZ_VERSION-bin.tar.gz" ]; then
     echo "Tez exists, skipping download..."
@@ -42,13 +43,36 @@ else
     fi
 fi

+if [ -f "$WORK_DIR/spark-$SPARK_VERSION-bin-hadoop3.tgz" ]; then
+    echo "Spark exists, skipping download..."
+else
+    echo "Download Spark..."
+    SPARK_URL=${SPARK_URL:-"https://archive.apache.org/dist/spark/spark-$SPARK_VERSION/spark-$SPARK_VERSION-bin-hadoop3.tgz"}
+    echo "Downloading Spark from $SPARK_URL..."
+    if ! curl --fail -L "$SPARK_URL" -o "$WORK_DIR/spark-$SPARK_VERSION-bin-hadoop3.tgz"; then
+        echo "Failed to download Spark, exiting..."
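+        # note: --fail makes curl return a non-zero status on HTTP errors instead
+        # of saving the error page, so a bad SPARK_URL stops the build right here
+        # rather than failing later in the tar -xzf step below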
+        exit 1
+    fi
+fi
+
 cp -R ./dependency/images/ "$WORK_DIR/"
 cp ../target/vineyard-hive-0.1-SNAPSHOT.jar "$WORK_DIR/images/"

 tar -xzf "$WORK_DIR/apache-hive-$HIVE_VERSION-bin.tar.gz" -C "$WORK_DIR/"
 tar -xzf "$WORK_DIR/apache-tez-$TEZ_VERSION-bin.tar.gz" -C "$WORK_DIR/"
+tar -xzf "$WORK_DIR/spark-$SPARK_VERSION-bin-hadoop3.tgz" -C "$WORK_DIR/"

 mv "$WORK_DIR/apache-hive-$HIVE_VERSION-bin" "$WORK_DIR/images/hive"
 mv "$WORK_DIR/apache-tez-$TEZ_VERSION-bin" "$WORK_DIR/images/tez"
+mv "$WORK_DIR/spark-$SPARK_VERSION-bin-hadoop3" "$WORK_DIR/images/spark"
+
+network_name="hadoop-network"
+
+if [[ -z $(docker network ls --filter name=^${network_name}$ --format="{{.Name}}") ]]; then
+    echo "Docker network ${network_name} does not exist, creating it..."
+    docker network create hadoop-network
+else
+    echo "Docker network ${network_name} already exists"
+fi

 docker build \
     "$WORK_DIR/images" \
diff --git a/java/hive/docker/dependency/images/Dockerfile b/java/hive/docker/dependency/images/Dockerfile
index 6307eb4b..e0a9bafc 100755
--- a/java/hive/docker/dependency/images/Dockerfile
+++ b/java/hive/docker/dependency/images/Dockerfile
@@ -24,7 +24,14 @@ COPY hive /opt/apache/hive/
 COPY hive-config/ /hive-config
 COPY hive-config-distributed/ /hive-config-distributed

-ENV PATH=${HADOOP_HOME}/bin:${HADOOP_HOME}/sbin:${HIVE_HOME}/bin:${PATH}
+# prepare spark
+COPY spark /opt/apache/spark/
+ENV SPARK_HOME=/opt/apache/spark
+COPY spark-config/ /spark-config
+COPY spark-config-distributed/ /spark-config-distributed
+COPY ./vineyard-hive-0.1-SNAPSHOT.jar ${SPARK_HOME}/jars/
+
+ENV PATH=${HADOOP_HOME}/bin:${HADOOP_HOME}/sbin:${HIVE_HOME}/bin:${SPARK_HOME}/bin:${PATH}

 COPY bootstrap.sh /opt/apache/
 COPY mysql-connector-java-5.1.49/mysql-connector-java-5.1.49-bin.jar ${HIVE_HOME}/lib/
@@ -35,4 +42,5 @@ RUN sudo yum -y install unzip; \
     sudo rm /usr/lib64/libstdc++.so.6; \
     sudo ln -s /usr/lib64/libstdc++.so.6.0.26 /usr/lib64/libstdc++.so.6; \
     sudo yum -y install vim; \
-    rm libstdc.so_.6.0.26.zip libstdc++.so.6.0.26;
+    rm libstdc.so_.6.0.26.zip libstdc++.so.6.0.26; \
+    sudo yum install -y net-tools
diff --git a/java/hive/docker/dependency/images/hive-config/hive-site.xml b/java/hive/docker/dependency/images/hive-config/hive-site.xml
index 68dd37c1..7ece0476 100644
--- a/java/hive/docker/dependency/images/hive-config/hive-site.xml
+++ b/java/hive/docker/dependency/images/hive-config/hive-site.xml
@@ -112,4 +112,8 @@
         <name>hive.metastore.client.connect.retry.delay</name>
         <value>5s</value>
     </property>
+    <property>
+        <name>hive.metastore.uris</name>
+        <value>thrift://metastore:9083</value>
+    </property>
 </configuration>
diff --git a/java/hive/docker/dependency/images/spark-config-distributed/spark-defaults.conf b/java/hive/docker/dependency/images/spark-config-distributed/spark-defaults.conf
new file mode 100644
index 00000000..75b03864
--- /dev/null
+++ b/java/hive/docker/dependency/images/spark-config-distributed/spark-defaults.conf
@@ -0,0 +1 @@
+spark.yarn.stagingDir=file:///tmp/
diff --git a/java/hive/docker/dependency/images/spark-config/spark-defaults.conf b/java/hive/docker/dependency/images/spark-config/spark-defaults.conf
new file mode 100644
index 00000000..b8c0cf29
--- /dev/null
+++ b/java/hive/docker/dependency/images/spark-config/spark-defaults.conf
@@ -0,0 +1,54 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+# Default system properties included when running spark-submit.
+# This is useful for setting default environmental settings.
+
+# Example:
+# spark.master                     spark://master:7077
+# spark.eventLog.enabled           true
+# spark.eventLog.dir               hdfs://namenode:8021/directory
+# spark.serializer                 org.apache.spark.serializer.KryoSerializer
+# spark.driver.memory              5g
+# spark.executor.extraJavaOptions  -XX:+PrintGCDetails -Dkey=value -Dnumbers="one two three"

+# spark.sql.inMemoryColumnarStorage.enableVectorizedReader=false
+# spark.sql.orc.enableNestedColumnVectorizedReader=false
+# spark.sql.orc.enableVectorizedReader=false
+# spark.sql.parquet.enableNestedColumnVectorizedReader=true
+# spark.sql.parquet.enableVectorizedReader=true
+# spark.hive.default.fileformat=Vineyard
+# spark.hive.metastore.warehouse.dir=vineyard:///opt/hive/data/warehouse
+# spark.sql.orc.impl=native
+# spark.sql.hive.convertMetastoreOrc=true
+# spark.hive.vectorized.execution.enabled=true
+# spark.hive.fetch.task.conversion=none
+# spark.hive.vectorized.use.vectorized.input.format=true
+# spark.hive.vectorized.use.row.serde.deserialize=false
+# spark.hive.vectorized.use.vector.serde.deserialize=true
+# spark.hive.vectorized.execution.reduce.enabled=true
+# spark.hive.exec.dynamic.partition.mode=nonstrict
+# spark.hive.metastore.sasl.enabled=false
+# spark.hive.server2.authentication=NOSASL
+# spark.hive.metastore.execute.setugi=false
+# spark.hive.metastore.warehouse.dir=/opt/hive/data/warehouse
+# spark.sql.warehouse.dir=/opt/hive/data/warehouse
+# spark.sql.hive.metastore.version=2.3.9
+# spark.sql.hive.metastore.jars=path
+# spark.sql.hive.metastore.jars.path=/opt/apache/hive/lib/*,/auxlib/*
+
+# spark.sql.catalogImplementation=hive
diff --git a/java/hive/docker/docker-compose-distributed.yaml b/java/hive/docker/docker-compose-distributed.yaml
index c7bca852..065ad889 100755
--- a/java/hive/docker/docker-compose-distributed.yaml
+++ b/java/hive/docker/docker-compose-distributed.yaml
@@ -16,8 +16,7 @@ services:
     volumes:
       - ~/hive-tmp:/tmp/
       - ~/hive-user:/user/
-      - ~/vineyard_sock/0/:/tmp/vineyard_sock
-      - ~/share:/auxlib/
+      - ../../../share:/auxlib/
     healthcheck:
       test: ["CMD-SHELL", "netstat -tnlp|grep :${HADOOP_YARN_RM_PORT} || exit 1"]
       interval: 20s
@@ -46,8 +45,8 @@ services:
     volumes:
       - ~/hive-tmp:/tmp/
       - ~/hive-user:/user/
-      - ~/vineyard_sock/0/:/tmp/vineyard_sock
-      - ~/share:/auxlib/
+      - ../../../build/vineyard_sock/0/:/tmp/vineyard_sock
+      - ../../../share:/auxlib/
     environment:
       VINEYARD_IPC_SOCKET: /tmp/vineyard_sock/vineyard.sock
   hadoop-yarn-nm-1:
@@ -73,8 +72,8 @@ services:
     volumes:
       - ~/hive-tmp:/tmp/
       - ~/hive-user:/user/
-      - ~/vineyard_sock/1/:/tmp/vineyard_sock
-      - ~/share:/auxlib/
+      - ../../../build/vineyard_sock/1/:/tmp/vineyard_sock
+      - ../../../share:/auxlib/
     environment:
       VINEYARD_IPC_SOCKET: /tmp/vineyard_sock/vineyard.sock
   hadoop-yarn-nm-2:
@@ -100,8 +99,8 @@ services:
     volumes:
       - ~/hive-tmp:/tmp/
       - ~/hive-user:/user/
-      - ~/vineyard_sock/2/:/tmp/vineyard_sock
-      - ~/share:/auxlib/
+      - ../../../build/vineyard_sock/2/:/tmp/vineyard_sock
+      - ../../../share:/auxlib/
     environment:
       VINEYARD_IPC_SOCKET: /tmp/vineyard_sock/vineyard.sock
   hadoop-yarn-proxyserver:
@@ -122,7 +121,7 @@ services:
     volumes:
       - ~/hive-tmp:/tmp/
       - ~/hive-user:/user/
-      - ~/share:/auxlib/
+      - ../../../share:/auxlib/
       - ~/hive-test:/test/
     healthcheck:
       test: ["CMD-SHELL", "netstat -tnlp|grep :${HADOOP_YARN_PROXYSERVER_PORT} || exit 1"]
@@ -147,8 +146,7 @@ services:
     volumes:
       - ~/hive-tmp:/tmp/
       - ~/hive-user:/user/
-      - ~/vineyard_sock/historyserver/:/tmp/vineyard_sock
-      - ~/share:/auxlib/
+      - ../../../share:/auxlib/
     healthcheck:
       test: ["CMD-SHELL", "netstat -tnlp|grep :${HADOOP_MR_HISTORYSERVER_PORT} || exit 1"]
       interval: 30s
@@ -176,8 +174,8 @@ services:
     volumes:
       - ~/hive-tmp:/tmp/
       - ~/hive-user:/user/
-      - ~/vineyard_sock/metastore/:/tmp/vineyard_sock
-      - ~/share:/auxlib/
+      - ../../../build/vineyard_sock/metastore/:/tmp/vineyard_sock
+      - ../../../share:/auxlib/
     environment:
       VINEYARD_IPC_SOCKET: /tmp/vineyard_sock/vineyard.sock
   hive-hiveserver2:
@@ -203,8 +201,8 @@ services:
     volumes:
       - ~/hive-tmp:/tmp/
       - ~/hive-user:/user/
-      - ~/vineyard_sock/hiveserver/:/tmp/vineyard_sock
-      - ~/share:/auxlib/
+      - ../../../build/vineyard_sock/hiveserver/:/tmp/vineyard_sock
+      - ../../../share:/auxlib/
     environment:
       VINEYARD_IPC_SOCKET: /tmp/vineyard_sock/vineyard.sock

diff --git a/java/hive/docker/docker-compose.yaml b/java/hive/docker/docker-compose.yaml
index c1c3ba54..6fc8a833 100644
--- a/java/hive/docker/docker-compose.yaml
+++ b/java/hive/docker/docker-compose.yaml
@@ -59,6 +59,23 @@ services:
       - ../../../build/vineyard:/tmp/vineyard
       - ../../../share:/opt/hive/auxlib

+  spark:
+    image: apache/hadoop_hive:v1
+    user: "root:root"
+    depends_on:
+      - metastore
+    container_name: spark
+    networks:
+      - hive
+    command: "tail -f /dev/null"
+    volumes:
+      - /user/hive/warehouse:/opt/hive/data/warehouse
+      - /user/hive/warehouse:/user/hive/warehouse
+      - ../../../java/hive/conf/hive-site.xml:/opt/hive/conf/hive-site.xml
+      - ../../../build/vineyard:/tmp/vineyard
+      - ../../../share:/auxlib
+    environment:
+      VINEYARD_IPC_SOCKET: /tmp/vineyard/vineyard.sock

 networks:
   hive:
diff --git a/java/hive/test/distributed-test.sh b/java/hive/test/distributed-test.sh
new file mode 100755
index 00000000..d2b99676
--- /dev/null
+++ b/java/hive/test/distributed-test.sh
@@ -0,0 +1,42 @@
+#!/bin/bash
+
+outdir=./query/out
+if [ -d "$outdir" ]; then
+    rm -r "$outdir"
+fi
+docker cp ./query hive-hiveserver2:/tmp/
+
+for file in ./query/*; do
+    query=$(basename "$file")
+    docker exec hive-hiveserver2 beeline -u 'jdbc:hive2://localhost:10000/;transportMode=https;httpPath=cliservice' \
+        -f /tmp/query/"$query" -n root
+done
+
+docker cp hive-hiveserver2:/tmp/out ./query/
+for dir in ./query/out/*; do
+    cat "$dir"/* > ./query/out/$(basename "$dir").q.out
+    rm -r "$dir"
+done
+
+filecount=$(find ./query/ -maxdepth 1 -name "*.q" | wc -l)
+testedcount=$(find ./query/out/ -maxdepth 1 -name "*.q.out" | wc -l)
+successcount=0
+failedcount=0
+
+for file in ./query/out/*; do
+    if [ -f "$file" ]; then
+        echo "Diff $file with expected/$(basename "$file")"
+        if diff -a "$file" ./expected/$(basename "$file"); then
+            successcount=$((successcount+1))
+        else
+            failedcount=$((failedcount+1))
+        fi
+    fi
+done
+
+echo "Total test: $filecount Success: $successcount Failed: $failedcount Skipped: $((filecount-testedcount))"
+if [ $successcount -eq $filecount ]; then
+    exit 0
+else
+    exit 1
+fi
diff --git a/java/hive/test/expected/test_all_primitive_types.q.out b/java/hive/test/expected/test_all_primitive_types.q.out
index 1c1aea94..9bb1cf81 100644
--- a/java/hive/test/expected/test_all_primitive_types.q.out
+++ b/java/hive/test/expected/test_all_primitive_types.q.out
@@ -1 +1 @@
-1,1,42,1,2.0,1.0,hello world1!,hello worl,hello worl,aGVsbG8gd29ybGQ0IQ==,2023-12-31,true,2023-12-31 23:59:59,1235.00
+1,1,42,1,2.0,1.0,hello world1!,hello world2!,hello world3!       ,aGVsbG8gd29ybGQ0IQ==,2023-12-31,true,2023-12-31 23:59:59,1235.00
diff --git a/java/hive/test/query/out/test_all_primitive_types.q.out b/java/hive/test/query/out/test_all_primitive_types.q.out
deleted file mode 100644
index 1c1aea94..00000000
--- a/java/hive/test/query/out/test_all_primitive_types.q.out
+++ /dev/null
@@ -1 +0,0 @@
-1,1,42,1,2.0,1.0,hello world1!,hello worl,hello worl,aGVsbG8gd29ybGQ0IQ==,2023-12-31,true,2023-12-31 23:59:59,1235.00
diff --git a/java/hive/test/query/out/test_hive_dynamic_partition.q.out b/java/hive/test/query/out/test_hive_dynamic_partition.q.out
deleted file mode 100644
index fdfc4c8f..00000000
--- a/java/hive/test/query/out/test_hive_dynamic_partition.q.out
+++ /dev/null
@@ -1,3 +0,0 @@
-1,2,1,2017
-1,2,1,2018
-3,4,1,2018
diff --git a/java/hive/test/query/out/test_hive_static_partition.q.out b/java/hive/test/query/out/test_hive_static_partition.q.out
deleted file mode 100644
index d1719b1e..00000000
--- a/java/hive/test/query/out/test_hive_static_partition.q.out
+++ /dev/null
@@ -1,10 +0,0 @@
-1,2,114514
-999,2,666
-999,2,666
-999,2,666
-3,4,666
-999,2,666
-999,2,666
-999,2,666
-3,4,666
-1,2,114514
diff --git a/java/hive/test/query/out/test_insert.q.out b/java/hive/test/query/out/test_insert.q.out
deleted file mode 100644
index 03c9c360..00000000
--- a/java/hive/test/query/out/test_insert.q.out
+++ /dev/null
@@ -1,3 +0,0 @@
-a,1
-b,2
-c,3
diff --git a/java/hive/test/query/out/test_nested_types.q.out b/java/hive/test/query/out/test_nested_types.q.out
deleted file mode 100644
index 1b425d3d..00000000
--- a/java/hive/test/query/out/test_nested_types.q.out
+++ /dev/null
@@ -1 +0,0 @@
-421hello2world!
diff --git a/java/hive/test/query/test_all_primitive_types.q b/java/hive/test/query/test_all_primitive_types.q
index 48466b11..ecd953c6 100644
--- a/java/hive/test/query/test_all_primitive_types.q
+++ b/java/hive/test/query/test_all_primitive_types.q
@@ -7,8 +7,8 @@ create table test_all_primitive_types (
     field_5 double,
     field_6 float,
     field_7 string,
-    field_9 varchar(10),
-    field_10 char(10),
+    field_9 varchar(20),
+    field_10 char(20),
     field_8 binary,
     field_11 date,
     field_12 boolean,
@@ -32,7 +32,7 @@ insert into test_all_primitive_types select
     timestamp('2023-12-31 23:59:59'),
     cast(1234.56 as decimal);

-insert overwrite directory '/tmp/out/test_all_primitive_types/'
+insert overwrite directory 'file:///tmp/out/test_all_primitive_types/'
 ROW FORMAT DELIMITED FIELDS TERMINATED BY ','
 select * from test_all_primitive_types;
 drop table test_all_primitive_types;
\ No newline at end of file
diff --git a/java/hive/test/query/test_hive_dynamic_partition.q b/java/hive/test/query/test_hive_dynamic_partition.q
index 155458bd..3bf15cd0 100644
--- a/java/hive/test/query/test_hive_dynamic_partition.q
+++ b/java/hive/test/query/test_hive_dynamic_partition.q
@@ -15,7 +15,7 @@ create table hive_dynamic_partition_test
 )partitioned by(mounth int, year int);
 insert into table hive_dynamic_partition_test partition(mounth=1, year) select src_id,dst_id,year from hive_dynamic_partition_data;

-insert overwrite directory '/tmp/out/test_hive_dynamic_partition/'
+insert overwrite directory 'file:///tmp/out/test_hive_dynamic_partition/'
 ROW FORMAT DELIMITED FIELDS TERMINATED BY ','
 select * from hive_dynamic_partition_test order by src_id asc;

diff --git a/java/hive/test/query/test_static_partition.q b/java/hive/test/query/test_hive_static_partition.q
similarity index 91%
rename from java/hive/test/query/test_static_partition.q
rename to java/hive/test/query/test_hive_static_partition.q
index 5337ee07..9f3e28bd 100644
--- a/java/hive/test/query/test_static_partition.q
+++ b/java/hive/test/query/test_hive_static_partition.q
@@ -21,7 +21,7 @@ select * from hive_static_partition where value=666
 union all
 select * from hive_static_partition where value=114514;

-insert overwrite directory '/tmp/out/test_hive_static_partition/'
+insert overwrite directory 'file:///tmp/out/test_hive_static_partition/'
 ROW FORMAT DELIMITED FIELDS TERMINATED BY ','
 select * from result order by field_1 asc;

diff --git a/java/hive/test/query/test_insert.q b/java/hive/test/query/test_insert.q
index 0c53f444..029194ad 100644
--- a/java/hive/test/query/test_insert.q
+++ b/java/hive/test/query/test_insert.q
@@ -3,7 +3,7 @@ create table hive_example(field_1 string,field_2 int);

 insert into hive_example values('a', 1), ('b', 2), ('c', 3);

-insert overwrite directory '/tmp/out/test_insert/'
+insert overwrite directory 'file:///tmp/out/test_insert/'
 row format delimited fields terminated by ','
 select * from hive_example order by field_1 asc;

diff --git a/java/hive/test/query/test_nested_types.q b/java/hive/test/query/test_nested_types.q
index 1366e970..8a0a8c56 100644
--- a/java/hive/test/query/test_nested_types.q
+++ b/java/hive/test/query/test_nested_types.q
@@ -13,7 +13,7 @@ insert into nested_table select
     named_struct('field_1', 2, 'field_2', 'world!')));

-insert overwrite directory '/tmp/out/test_nested_types/'
+insert overwrite directory 'file:///tmp/out/test_nested_types/'
 ROW FORMAT DELIMITED FIELDS TERMINATED BY ','
 select * from nested_table;
 drop table nested_table;
\ No newline at end of file
diff --git a/java/hive/test/spark-hive-distributed-test.sh b/java/hive/test/spark-hive-distributed-test.sh
new file mode 100755
index 00000000..a78e5ff8
--- /dev/null
+++ b/java/hive/test/spark-hive-distributed-test.sh
@@ -0,0 +1,46 @@
+#!/bin/bash
+
+outdir=./spark-query/spark-out
+if [ -d "$outdir" ]; then
+    rm -r "$outdir"
+fi
+
+docker exec hive-metastore sh -c 'cp /hive-config-distributed/hive-site.xml $SPARK_HOME/conf/'
+docker exec hive-metastore sh -c 'cp /spark-config-distributed/* $SPARK_HOME/conf/'
+
+docker cp ./spark-query hive-metastore:/tmp/
+
+for file in ./spark-query/*; do
+    query=$(basename "$file")
+    docker exec hive-metastore spark-shell --master yarn -i /tmp/spark-query/"$query"
+done
+
+docker cp hive-metastore:/tmp/spark-out ./spark-query/
+for dir in ./spark-query/spark-out/*; do
+    cat "$dir"/part-* > ./spark-query/spark-out/$(basename "$dir").q.out
+    rm -r "$dir"
+done
+
+filecount=$(find ./spark-query/ -name "*.scala" | wc -l)
+testedcount=$(find ./spark-query/spark-out/ -name "*.out" | wc -l)
+successcount=0
+failedcount=0
+
+for file in ./spark-query/spark-out/*; do
+    if [ -f "$file" ]; then
+        echo "Diff $file with expected/$(basename "$file")"
+        if diff -a "$file" ./expected/$(basename "$file"); then
+            successcount=$((successcount+1))
+        else
+            failedcount=$((failedcount+1))
+        fi
+    fi
+done
+
+echo "Total test: $filecount Success: $successcount Failed: $failedcount Skipped: $((filecount-testedcount))"
+
+if [ $successcount -eq $filecount ]; then
+    exit 0
+else
+    exit 1
+fi
\ No newline at end of file
diff --git a/java/hive/test/spark-hive-test.sh b/java/hive/test/spark-hive-test.sh
new file mode 100755
index 00000000..4ce53a0b
--- /dev/null
+++ b/java/hive/test/spark-hive-test.sh
@@ -0,0 +1,46 @@
+#!/bin/bash
+
+outdir=./spark-query/spark-out
+if [ -d "$outdir" ]; then
+    rm -r "$outdir"
+fi
+
+docker exec spark sh -c 'cp /hive-config/hive-site.xml $SPARK_HOME/conf/'
+docker exec spark sh -c 'cp /spark-config/* $SPARK_HOME/conf/'
+
+docker cp ./spark-query spark:/tmp/
+
+for file in ./spark-query/*; do
+    query=$(basename "$file")
+    docker exec spark spark-shell -i /tmp/spark-query/"$query"
+done
+
+docker cp spark:/tmp/spark-out ./spark-query/
+for dir in ./spark-query/spark-out/*; do
+    cat "$dir"/part-* > ./spark-query/spark-out/$(basename "$dir").q.out
+    rm -r "$dir"
+done
+
+filecount=$(find ./spark-query/ -name "*.scala" | wc -l)
+testedcount=$(find ./spark-query/spark-out/ -name "*.out" | wc -l)
+successcount=0
+failedcount=0
+
+for file in ./spark-query/spark-out/*; do
+    if [ -f "$file" ]; then
+        echo "Diff $file with expected/$(basename "$file")"
+        if diff -a "$file" ./expected/$(basename "$file"); then
+            successcount=$((successcount+1))
+        else
+            failedcount=$((failedcount+1))
+        fi
+    fi
+done
+
+echo "Total test: $filecount Success: $successcount Failed: $failedcount Skipped: $((filecount-testedcount))"
+
+if [ $successcount -eq $filecount ]; then
+    exit 0
+else
+    exit 1
+fi
\ No newline at end of file
diff --git a/java/hive/test/spark-query/test_all_primitive_types.scala b/java/hive/test/spark-query/test_all_primitive_types.scala
new file mode 100644
index 00000000..f7d36f8b
--- /dev/null
+++ b/java/hive/test/spark-query/test_all_primitive_types.scala
@@ -0,0 +1,75 @@
+import org.apache.spark.{SparkConf, SparkContext}
+import org.apache.spark.sql.SparkSession
+
+val conf = new SparkConf()
+conf.setAppName("Spark on Vineyard").setMaster("yarn").set("spark.scheduler.minRegisteredResourcesRatio", "1.0")
+
+val spark = SparkSession.builder().config(conf).enableHiveSupport().getOrCreate()
+
+spark.sql(
+  """
+    |show tables;
+    |""".stripMargin).show()
+
+spark.sql(
+  """
+    |drop table if exists test_all_primitive_types;
+    |""".stripMargin)
+
+spark.sql(
+  """
+    | create table test_all_primitive_types (
+    |     field_1 tinyint,
+    |     field_2 smallint,
+    |     field_3 bigint,
+    |     field_4 int,
+    |     field_5 double,
+    |     field_6 float,
+    |     field_7 string,
+    |     field_9 varchar(20),
+    |     field_10 char(20),
+    |     field_8 binary,
+    |     field_11 date,
+    |     field_12 boolean,
+    |     field_13 timestamp,
+    |     field_14 decimal(6, 2)
+    | )
+    | row format serde "io.v6d.hive.ql.io.VineyardSerDe"
+    | stored as
+    |     INPUTFORMAT 'io.v6d.hive.ql.io.VineyardInputFormat'
+    |     OUTPUTFORMAT 'io.v6d.hive.ql.io.VineyardOutputFormat'
+    | LOCATION 'vineyard:///opt/hive/data/warehouse/spark_example'
+    |""".stripMargin).show()
+
+spark.sql(
+  """
+    | insert into test_all_primitive_types select
+    |     tinyint(1),
+    |     smallint(1),
+    |     42,
+    |     bigint(1),
+    |     double(2.0),
+    |     float(1.0),
+    |     'hello world1!',
+    |     'hello world2!',
+    |     'hello world3!',
+    |     cast('hello world4!' as binary),
+    |     date('2023-12-31'),
+    |     true,
+    |     timestamp('2023-12-31 23:59:59'),
+    |     cast(1234.56 as decimal);
+    |""".stripMargin).show()
+
+spark.sql(
+  """
+    | insert overwrite directory '/tmp/spark-out/test_all_primitive_types/'
+    | ROW FORMAT DELIMITED FIELDS TERMINATED BY ','
+    | select * from test_all_primitive_types;
+    |""".stripMargin).show(truncate=false)
+
+spark.sql(
+  """
+    | drop table test_all_primitive_types;
+    |""".stripMargin).show(truncate=false)
+
+System.exit(0)
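+
+// Note: spark-hive-test.sh and spark-hive-distributed-test.sh run this file
+// non-interactively via `spark-shell -i /tmp/spark-query/<name>.scala`; spark-shell
+// drops back into the REPL after evaluating an -i script, so the System.exit(0)
+// above is what lets the test harness terminate instead of hanging.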