Yandex.Disk Query runner #6598

Merged: 22 commits, Nov 22, 2023

Binary file added client/app/assets/images/db-logos/yandex_disk.png
37 changes: 12 additions & 25 deletions redash/query_runner/python.py
@@ -9,6 +9,7 @@
     guarded_unpack_sequence,
     safe_builtins,
 )
+from RestrictedPython.transformer import IOPERATOR_TO_STR

 from redash import models
 from redash.query_runner import (
@@ -23,16 +24,17 @@
     register,
 )
 from redash.utils import json_dumps, json_loads
+from redash.utils.pandas import pandas_installed

-try:
-    import numpy as np
+if pandas_installed:
     import pandas as pd

-    pandas_installed = True
-except ImportError:
-    pandas_installed = False
+    from redash.utils.pandas import pandas_to_result
+
+    enabled = True
+else:
+    enabled = False

-from RestrictedPython.transformer import IOPERATOR_TO_STR
-
 logger = logging.getLogger(__name__)
@@ -271,26 +273,11 @@
         return query.latest_query_data.data

     def dataframe_to_result(self, result, df):
-        result["rows"] = df.to_dict("records")
+        converted_result = pandas_to_result(df)

-        for column_name, column_type in df.dtypes.items():
-            if column_type == np.bool_:
-                redash_type = TYPE_BOOLEAN
-            elif column_type == np.inexact:
-                redash_type = TYPE_FLOAT
-            elif column_type == np.integer:
-                redash_type = TYPE_INTEGER
-            elif column_type in (np.datetime64, np.dtype("<M8[ns]")):
-                if df.empty:
-                    redash_type = TYPE_DATETIME
-                elif len(df[column_name].head(1).astype(str).loc[0]) > 10:
-                    redash_type = TYPE_DATETIME
-                else:
-                    redash_type = TYPE_DATE
-            else:
-                redash_type = TYPE_STRING
+        result["rows"] = converted_result["rows"]

-            self.add_result_column(result, column_name, column_name, redash_type)
+        for column in converted_result["columns"]:
+            self.add_result_column(result, column["name"], column["friendly_name"], column["type"])

     def get_current_user(self):
         return self._current_user.to_dict()
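For reference, a minimal sketch (not part of this PR) of the dict shape the refactored dataframe_to_result expects back from pandas_to_result, using illustrative column names and values; the type constants are the ones redash.query_runner already exports:

from redash.query_runner import TYPE_INTEGER, TYPE_STRING

# Illustrative only: what pandas_to_result(df) hands back for a two-column frame.
converted_result = {
    "columns": [
        {"name": "id", "friendly_name": "id", "type": TYPE_INTEGER},
        {"name": "name", "friendly_name": "name", "type": TYPE_STRING},
    ],
    "rows": [
        {"id": 1, "name": "alpha"},
        {"id": 2, "name": "beta"},
    ],
}
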
166 changes: 166 additions & 0 deletions redash/query_runner/yandex_disk.py
@@ -0,0 +1,166 @@
import logging
from importlib.util import find_spec

import requests
import yaml

from redash.query_runner import BaseSQLQueryRunner, register
from redash.utils import json_dumps
from redash.utils.pandas import pandas_installed

openpyxl_installed = find_spec("openpyxl")

if pandas_installed and openpyxl_installed:
    import openpyxl  # noqa: F401
    import pandas as pd

    from redash.utils.pandas import pandas_to_result

    enabled = True

    EXTENSIONS_READERS = {
        "csv": pd.read_csv,
        "tsv": pd.read_table,
        "xls": pd.read_excel,
        "xlsx": pd.read_excel,
    }
else:
    enabled = False

logger = logging.getLogger(__name__)


class YandexDisk(BaseSQLQueryRunner):
    should_annotate_query = False

    @classmethod
    def type(cls):
        return "yandex_disk"

    @classmethod
    def name(cls):
        return "Yandex Disk"

    @classmethod
    def configuration_schema(cls):
        return {
            "type": "object",
            "properties": {
                "token": {"type": "string", "title": "OAuth Token"},
            },
            "secret": ["token"],
            "required": ["token"],
        }

    def __init__(self, configuration):
        super(YandexDisk, self).__init__(configuration)
        self.syntax = "yaml"
        self.base_url = "https://cloud-api.yandex.net/v1/disk"
        self.list_path = "counters"

    def _get_tables(self, schema):
        offset = 0
        limit = 100

        while True:
            tmp_response = self._send_query(
                "resources/public", media_type="spreadsheet,text", limit=limit, offset=offset
            )

            tmp_items = tmp_response["items"]

            for file_info in tmp_items:
                file_name = file_info["name"]
                file_path = file_info["path"].replace("disk:", "")

                file_extension = file_name.split(".")[-1].lower()
                if file_extension not in EXTENSIONS_READERS:
                    continue

                schema[file_name] = {"name": file_name, "columns": [file_path]}

            if len(tmp_items) < limit:
                break

            offset += limit

        return list(schema.values())

    def test_connection(self):
        self._send_query()

    def _send_query(self, url_path="", **kwargs):
        token = kwargs.pop("oauth_token", self.configuration["token"])
        r = requests.get(
            f"{self.base_url}/{url_path}",
            headers={"Authorization": f"OAuth {token}"},
            params=kwargs,
        )

        response_data = r.json()

        if not r.ok:
            error_message = f"Code: {r.status_code}, message: {r.text}"
            raise Exception(error_message)
        return response_data

    def run_query(self, query, user):
        logger.debug("Yandex Disk is about to execute query: %s", query)
        data = None

        if not query:
            error = "Query is empty"
            return data, error

        try:
            params = yaml.safe_load(query)
        except (ValueError, AttributeError) as e:
            logger.exception(e)
            error = f"YAML read error: {str(e)}"
            return data, error

        if not isinstance(params, dict):
            error = "The query format must be JSON or YAML"
            return data, error

        if "path" not in params:
            error = "The query must contain path"
            return data, error

        file_extension = params["path"].split(".")[-1].lower()

        read_params = {}
        is_multiple_sheets = False

        if file_extension not in EXTENSIONS_READERS:
            error = f"Unsupported file extension: {file_extension}"
            return data, error
        elif file_extension in ("xls", "xlsx"):
            read_params["sheet_name"] = params.get("sheet_name", 0)
            if read_params["sheet_name"] is None:
                is_multiple_sheets = True

        file_url = self._send_query("resources/download", path=params["path"])["href"]

        try:
            df = EXTENSIONS_READERS[file_extension](file_url, **read_params)
        except Exception as e:
            logger.exception(e)
            error = f"Read file error: {str(e)}"
            return data, error

        if is_multiple_sheets:
            new_df = []
            for sheet_name, sheet_df in df.items():
                sheet_df["sheet_name"] = sheet_name
                new_df.append(sheet_df)
            new_df = pd.concat(new_df, ignore_index=True)
            df = new_df.copy()

        data = json_dumps(pandas_to_result(df))
        error = None

        return data, error


register(YandexDisk)
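
For illustration, a hedged usage sketch (not part of the diff) of the new runner, assuming a plain dict works where the configuration container normally goes and that a real OAuth token and Disk path are supplied; run_query parses the YAML body, resolves a download link through the Disk API, reads the file with pandas, and returns a (json_data, error) tuple:

from redash.query_runner.yandex_disk import YandexDisk

# Hypothetical token and path. sheet_name: null asks for every sheet of the
# workbook; the sheets are concatenated with an extra "sheet_name" column.
runner = YandexDisk({"token": "<oauth-token>"})
query = """
path: /reports/sales.xlsx
sheet_name: null
"""

data, error = runner.run_query(query, user=None)
if error is None:
    print(data)  # JSON string with "columns" and "rows" built by pandas_to_result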
1 change: 1 addition & 0 deletions redash/settings/__init__.py
@@ -298,6 +298,7 @@ def email_server_is_configured():
     "redash.query_runner.clickhouse",
     "redash.query_runner.tinybird",
     "redash.query_runner.yandex_metrica",
+    "redash.query_runner.yandex_disk",
     "redash.query_runner.rockset",
     "redash.query_runner.treasuredata",
     "redash.query_runner.sqlite",
47 changes: 47 additions & 0 deletions redash/utils/pandas.py
@@ -0,0 +1,47 @@
import logging
from importlib.util import find_spec

from redash.query_runner import (
    TYPE_BOOLEAN,
    TYPE_DATE,
    TYPE_DATETIME,
    TYPE_FLOAT,
    TYPE_INTEGER,
    TYPE_STRING,
)

logger = logging.getLogger(__name__)

pandas_installed = find_spec("pandas") and find_spec("numpy")

if pandas_installed:
    import numpy as np
    import pandas as pd

    def get_column_types_from_dataframe(df: pd.DataFrame) -> list:
        columns = []
        for column_name, column_type in df.dtypes.items():
            if column_type in (np.bool_,):
                redash_type = TYPE_BOOLEAN
            elif column_type in (np.int64, np.int32):
                redash_type = TYPE_INTEGER
            elif column_type in (np.float64,):
                redash_type = TYPE_FLOAT
            elif column_type in (np.datetime64, np.dtype("<M8[ns]")):
                if df.empty:
                    redash_type = TYPE_DATETIME
                elif len(df[column_name].head(1).astype(str).loc[0]) > 10:
                    redash_type = TYPE_DATETIME
                else:
                    redash_type = TYPE_DATE
            else:
                redash_type = TYPE_STRING

            columns.append({"name": column_name, "friendly_name": column_name, "type": redash_type})

        return columns

    def pandas_to_result(df: pd.DataFrame) -> dict:
        columns = get_column_types_from_dataframe(df)
        rows = df.to_dict("records")
        return {"columns": columns, "rows": rows}
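
As a usage sketch (not part of the diff), pandas_to_result maps pandas dtypes to the Redash type constants and serializes rows as records; for a small frame it returns a dict pairing a "columns" list with a "rows" list:

import pandas as pd

from redash.query_runner import TYPE_BOOLEAN, TYPE_FLOAT, TYPE_INTEGER, TYPE_STRING
from redash.utils.pandas import pandas_to_result

df = pd.DataFrame(
    {
        "id": [1, 2],           # int64   -> TYPE_INTEGER
        "price": [9.99, 14.5],  # float64 -> TYPE_FLOAT
        "paid": [True, False],  # bool    -> TYPE_BOOLEAN
        "name": ["a", "b"],     # object  -> TYPE_STRING
    }
)

result = pandas_to_result(df)
# result["columns"][0] == {"name": "id", "friendly_name": "id", "type": TYPE_INTEGER}
# result["rows"][0]    == {"id": 1, "price": 9.99, "paid": True, "name": "a"}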