From 8e72a0c2a911ea888277c38aca2a528058ee1f56 Mon Sep 17 00:00:00 2001
From: JIANXIONG DONG
Date: Fri, 11 Feb 2022 11:15:28 -0800
Subject: [PATCH 1/3] update spark dependency in build.sbt

---
 build.sbt                  | 31 ++++++++++++++-----------------
 project/assembly.sbt       |  2 +-
 project/build.properties   |  2 +-
 project/spark_packages.sbt |  3 ---
 4 files changed, 16 insertions(+), 22 deletions(-)
 delete mode 100644 project/spark_packages.sbt

diff --git a/build.sbt b/build.sbt
index cc4fdea..1ebd2ee 100644
--- a/build.sbt
+++ b/build.sbt
@@ -1,24 +1,14 @@
 name := "spark-tree-plotting"
-version := "0.2"
+version := "0.3"
-scalaVersion := "2.11.12"
+scalaVersion := "2.12.10"
-libraryDependencies += "net.liftweb" % "lift-json_2.11" % "3.3.0"
+libraryDependencies += "net.liftweb" % "lift-json_2.12" % "3.5.0"
+libraryDependencies += "org.apache.spark" % "spark-core_2.12" % "3.1.0"
-assemblyShadeRules in assembly := Seq(
-  ShadeRule.rename("net.liftweb.json.**" -> "org.lift.web.library.json.@1").inAll
-)
-
-// Spark Packages config
-spName := "julioasotodv/spark-tree-plotting"
-
-sparkVersion := "2.3.2"
-
-sparkComponents += "mllib"
-
-credentials += Credentials(Path.userHome / ".ivy2" / ".sbtcredentials")
+libraryDependencies += "org.apache.spark" % "spark-mllib_2.12" % "3.1.0"
 spShortDescription := "A simple tool for plotting Spark ML's Decision Trees"
 spDescription := """This module provides a simple tool for plotting an easy to understand graphical representation
 |of Spark ML's DecisionTreeClassificationModels, very similar to the one Python's Scikit-Learn provides.
 |Given a DecisionTreeClassificationModel, spark_tree_plotting generates a JSON file with
 |the relevant metadata in order to plot the tree. Moreover, a simple JSON-to-DOT python
 licenses += "MIT" -> url("https://opensource.org/licenses/MIT")
 spIncludeMaven := false
-
 // Resulting name for the assembly jar
-assemblyJarName in assembly := "spark-tree-plotting_0.2.jar"
+assemblyJarName in assembly := "spark-tree-plotting_0.3.jar"
 // Do not include the Scala library itself in the jar
 assemblyOption in assembly := (assemblyOption in assembly).value.copy(includeScala = false)
+assemblyMergeStrategy := {
+  case m if m.toLowerCase.endsWith("manifest.mf") => MergeStrategy.discard
+  case m if m.toLowerCase.matches("meta-inf.*\\.sf$") => MergeStrategy.discard
+  case "log4j.properties" => MergeStrategy.discard
+  case m if m.toLowerCase.startsWith("meta-inf/services/") => MergeStrategy.filterDistinctLines
+  case "reference.conf" => MergeStrategy.concat
+  case _ => MergeStrategy.first
+}
diff --git a/project/assembly.sbt b/project/assembly.sbt
index 418b0e1..429a3c6 100644
--- a/project/assembly.sbt
+++ b/project/assembly.sbt
@@ -1,3 +1,3 @@
 resolvers += Resolver.sbtPluginRepo("releases")
-addSbtPlugin("com.eed3si9n" %% "sbt-assembly" % "0.14.4")
+addSbtPlugin("com.eed3si9n" %% "sbt-assembly" % "1.0.0")
diff --git a/project/build.properties b/project/build.properties
index 133a8f1..c8fcab5 100644
--- a/project/build.properties
+++ b/project/build.properties
@@ -1 +1 @@
-sbt.version=0.13.17
+sbt.version=1.6.2
diff --git a/project/spark_packages.sbt b/project/spark_packages.sbt
deleted file mode 100644
index db48284..0000000
--- a/project/spark_packages.sbt
+++ /dev/null
@@ -1,3 +0,0 @@
-resolvers += "bintray-spark-packages" at "https://dl.bintray.com/spark-packages/maven/"
-
-addSbtPlugin("org.spark-packages" % "sbt-spark-package" % "0.2.6")
\ No newline at end of file

From 7dfacb8075409b22541b2cbed1de6539af4c27f7 Mon Sep 17 00:00:00 2001
From: JIANXIONG DONG
Date: Fri, 11 Feb 2022 13:02:23 -0800
Subject: [PATCH 2/3] Remove spark dependency from assembly

---
 build.sbt | 20 +++++++++-----------
 1 file changed, 9 insertions(+), 11 deletions(-)

diff --git a/build.sbt b/build.sbt
index 1ebd2ee..cd4e286 100644
--- a/build.sbt
+++ b/build.sbt
@@ -4,15 +4,15 @@ version := "0.3"
 scalaVersion := "2.12.10"
-libraryDependencies += "net.liftweb" % "lift-json_2.12" % "3.5.0"
"net.liftweb" % "lift-json_2.12" % "3.5.0" +libraryDependencies ++= Seq( + "net.liftweb" % "lift-json_2.12" % "3.5.0", + "org.apache.spark" % "spark-core_2.12" % "3.1.0" % "provided", + "org.apache.spark" % "spark-mllib_2.12" % "3.1.0" % "provided" +) -libraryDependencies += "org.apache.spark" % "spark-core_2.12" % "3.1.0" +lazy val spShortDescription = "A simple tool for plotting Spark ML's Decision Trees" -libraryDependencies += "org.apache.spark" % "spark-mllib_2.12" % "3.1.0" - -spShortDescription := "A simple tool for plotting Spark ML's Decision Trees" - -spDescription := """This module provides a simple tool for plotting an easy to understand graphical representation +lazy val spDescription = """This module provides a simple tool for plotting an easy to understand graphical representation |of Spark ML's DecisionTreeClassificationModels, very similar to the one Python's Scikit-Learn provides. |Given a DecisionTreeClassificationModel, spark_tree_plotting generates a JSON file with |the relevant metadata in order to plot the tree. Moreover, a simple JSON-to-DOT python @@ -20,13 +20,11 @@ spDescription := """This module provides a simple tool for plotting an easy to u licenses += "MIT" -> url("https://opensource.org/licenses/MIT") -spIncludeMaven := false - // Resulting name for the assembly jar -assemblyJarName in assembly := "spark-tree-plotting_0.3.jar" +assembly / assemblyJarName := { name.value + "-assembly-" + version.value + ".jar" } // Do not include the Scala library itself in the jar -assemblyOption in assembly := (assemblyOption in assembly).value.copy(includeScala = false) +assembly / assemblyOption := (assembly / assemblyOption).value.withIncludeScala(false) assemblyMergeStrategy := { case m if m.toLowerCase.endsWith("manifest.mf") => MergeStrategy.discard From b3bc0f7a6d2aa977527609b6405472fd1f17693e Mon Sep 17 00:00:00 2001 From: JIANXIONG DONG Date: Fri, 11 Feb 2022 19:25:17 -0800 Subject: [PATCH 3/3] package the python scripts based on standard pip package --- python/spark_tree_plotting/README.md | 9 ++ python/spark_tree_plotting/setup.py | 121 ++++++++++++++++++ .../spark_tree_plotting/__init__.py | 0 .../spark_tree_plotting/__version__.py | 1 + .../spark_tree_plotting.py | 4 +- 5 files changed, 133 insertions(+), 2 deletions(-) create mode 100644 python/spark_tree_plotting/README.md create mode 100644 python/spark_tree_plotting/setup.py create mode 100644 python/spark_tree_plotting/spark_tree_plotting/__init__.py create mode 100644 python/spark_tree_plotting/spark_tree_plotting/__version__.py rename python/{ => spark_tree_plotting/spark_tree_plotting}/spark_tree_plotting.py (99%) diff --git a/python/spark_tree_plotting/README.md b/python/spark_tree_plotting/README.md new file mode 100644 index 0000000..1cdd8cb --- /dev/null +++ b/python/spark_tree_plotting/README.md @@ -0,0 +1,9 @@ +A Simple tool for plotting Spark ML's Decision Trees +-- +If you like to use API, you can install the package + +python setup.py install + +Create wheel package +-- +python setup.py bdist_wheel diff --git a/python/spark_tree_plotting/setup.py b/python/spark_tree_plotting/setup.py new file mode 100644 index 0000000..890bbcc --- /dev/null +++ b/python/spark_tree_plotting/setup.py @@ -0,0 +1,121 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- + +# Appreciatively cargo-culted from: https://github.com/kennethreitz/setup.py + +# Note: To use the 'upload' functionality of this file, you must: +# $ pip install twine + +import io +import os +import sys +from shutil import rmtree + +from 
+
+# Package meta-data.
+NAME = 'spark_tree_plotting'
+DESCRIPTION = 'A simple tool for plotting Spark ML Decision Trees'
+URL = 'https://github.com/julioasotodv/spark-tree-plotting'
+EMAIL = 'julioasotodv@gmail.com'
+AUTHOR = 'Julio Antonio Soto'
+
+# What packages are required for this module to be executed?
+REQUIRED = [
+    'pydot'
+]
+
+# The rest you shouldn't have to touch too much :)
+# ------------------------------------------------
+# Except, perhaps the License and Trove Classifiers!
+# If you do change the License, remember to change the Trove Classifier for that!
+
+here = os.path.abspath(os.path.dirname(__file__))
+
+# Import the README and use it as the long-description.
+# Note: this will only work if 'README.md' is present in your MANIFEST.in file!
+with io.open(os.path.join(here, 'README.md'), encoding='utf-8') as f:
+    long_description = '\n' + f.read()
+
+# Load the package's __version__.py module as a dictionary.
+about = {}
+with open(os.path.join(here, NAME, '__version__.py')) as f:
+    exec(f.read(), about)
+
+
+class UploadCommand(Command):
+    """Support setup.py upload."""
+
+    description = 'Build and publish the package.'
+    user_options = []
+
+    @staticmethod
+    def status(s):
+        """Prints things in bold."""
+        print('\033[1m{0}\033[0m'.format(s))
+
+    def initialize_options(self):
+        pass
+
+    def finalize_options(self):
+        pass
+
+    def run(self):
+        try:
+            self.status('Removing previous builds…')
+            rmtree(os.path.join(here, 'dist'))
+        except OSError:
+            pass
+
+        self.status('Building Source and Wheel (universal) distribution…')
+        os.system('{0} setup.py sdist bdist_wheel --universal'.format(sys.executable))
+
+        self.status('Uploading the package to PyPI via Twine…')
+        os.system('twine upload dist/*')
+
+        sys.exit()
+
+
+# Where the magic happens:
+setup(
+    name=NAME,
+    version=about['__version__'],
+    description=DESCRIPTION,
+    long_description=long_description,
+    author=AUTHOR,
+    author_email=EMAIL,
+    url=URL,
+    packages=find_packages(exclude=('tests',)),
+    # If your package is a single module, use this instead of 'packages':
+    # py_modules=['mypackage'],
+
+    # entry_points={
+    #     'console_scripts': ['mycli=mymodule:cli'],
+    # },
+
+    test_suite='spark_tree_plotting.tests',
+
+    install_requires=REQUIRED,
+    include_package_data=True,
+    license='Apache Software License',
+    classifiers=[
+        # Trove classifiers
+        # Full list: https://pypi.python.org/pypi?%3Aaction=list_classifiers
+        'License :: OSI Approved :: Apache Software License',
+        'Programming Language :: Python',
+        'Programming Language :: Python :: 2.7',
+        'Programming Language :: Python :: 3',
+        'Programming Language :: Python :: 3.3',
+        'Programming Language :: Python :: 3.4',
+        'Programming Language :: Python :: 3.5',
+        'Programming Language :: Python :: 3.6',
+        'Programming Language :: Python :: Implementation :: CPython',
+        'Programming Language :: Python :: Implementation :: PyPy',
+        'Development Status :: 3 - Alpha',
+        'Topic :: Scientific/Engineering',
+    ],
+    # $ setup.py publish support.
+    cmdclass={
+        'upload': UploadCommand,
+    },
+)
diff --git a/python/spark_tree_plotting/spark_tree_plotting/__init__.py b/python/spark_tree_plotting/spark_tree_plotting/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/python/spark_tree_plotting/spark_tree_plotting/__version__.py b/python/spark_tree_plotting/spark_tree_plotting/__version__.py
new file mode 100644
index 0000000..0404d81
--- /dev/null
+++ b/python/spark_tree_plotting/spark_tree_plotting/__version__.py
@@ -0,0 +1 @@
+__version__ = '0.3.0'
diff --git a/python/spark_tree_plotting.py b/python/spark_tree_plotting/spark_tree_plotting/spark_tree_plotting.py
similarity index 99%
rename from python/spark_tree_plotting.py
rename to python/spark_tree_plotting/spark_tree_plotting/spark_tree_plotting.py
index 058075a..b51b67b 100644
--- a/python/spark_tree_plotting.py
+++ b/python/spark_tree_plotting/spark_tree_plotting/spark_tree_plotting.py
@@ -424,7 +424,7 @@ def plot_tree(DecisionTreeClassificationModel, featureNames=None, categoryNames=
         from pydot import graph_from_dot_data
     except ImportError:
         raise ImportError(
-            "This function requires pydot3 dot be installed. You can easily install it with pip install pydot3"
+            "This function requires pydot to be installed. You can easily install it with pip install pydot"
             )
     graph = graph_from_dot_data(export_graphviz(DecisionTreeClassificationModel,
@@ -441,4 +441,4 @@ def plot_tree(DecisionTreeClassificationModel, featureNames=None, categoryNames=
     else:
         plot = graph.create_png()
     return plot
-    
\ No newline at end of file
+    
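
Usage sketch (not taken from the patches themselves): assuming the wheel built with `python setup.py bdist_wheel` has been pip-installed and the assembly jar produced by `sbt assembly` (spark-tree-plotting-assembly-0.3.jar, following the assemblyJarName set in patch 2) is passed to Spark via `--jars`, the relocated module could be used roughly as below. `train_df` is a hypothetical DataFrame with "features" and "label" columns; only the model argument and the `featureNames` parameter of `plot_tree`, both visible in the diff above, are relied on here.

    # Hypothetical usage of the repackaged spark_tree_plotting module.
    from pyspark.ml.classification import DecisionTreeClassifier
    from spark_tree_plotting.spark_tree_plotting import plot_tree

    # train_df is a placeholder training DataFrame with "features"/"label" columns.
    model = DecisionTreeClassifier(featuresCol="features", labelCol="label").fit(train_df)

    # plot_tree returns PNG bytes (graph.create_png() in the module above).
    png_bytes = plot_tree(model, featureNames=["age", "income"])
    with open("tree.png", "wb") as f:
        f.write(png_bytes)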