Skip to content

Commit

Permalink
Remove support for importing individual files. (#266)
Browse files Browse the repository at this point in the history
  • Loading branch information
keyurva authored Dec 16, 2023
1 parent 186af93 commit 4e4831b
Show file tree
Hide file tree
Showing 5 changed files with 28 additions and 50 deletions.
2 changes: 1 addition & 1 deletion run_test.sh
Original file line number Diff line number Diff line change
Expand Up @@ -103,7 +103,7 @@ function run_sample {
rm -f sample/output/datacommons.db

echo "Running sample."
python3 -m stats.main --input_path=sample/input --output_dir=sample/output --freeze_time
python3 -m stats.main --input_dir=sample/input --output_dir=sample/output --freeze_time

echo "Writing tables to CSVs."
mkdir -p sample/output/debug
Expand Down
4 changes: 2 additions & 2 deletions simple/sample/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -5,14 +5,14 @@ This folder contains sample input files under the `input` folder and the generat
To run import on the sample input files, `cd` into the `simple` folder and then run:

```bash
python3 -m stats.main --input_path=sample/input --output_dir=sample/output
python3 -m stats.main --input_dir=sample/input --output_dir=sample/output
```

If you are planning to check in the sample outputs to GitHub,
consider running with the `--freeze_time` flag.
This freezes the reporting time to `2023-01-01` and avoids updates to `report.json` on every run.

```bash
python3 -m stats.main --input_path=sample/input --output_dir=sample/output --freeze_time
python3 -m stats.main --input_dir=sample/input --output_dir=sample/output --freeze_time
```

2 changes: 1 addition & 1 deletion simple/stats/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@

# Defaults.
DEFAULT_DATA_DIR = ".data"
DEFAULT_INPUT_PATH = os.path.join(DEFAULT_DATA_DIR, "input")
DEFAULT_INPUT_DIR = os.path.join(DEFAULT_DATA_DIR, "input")
DEFAULT_OUTPUT_DIR = os.path.join(DEFAULT_DATA_DIR, "output")
DEFAULT_FROZEN_TIME = "2023-01-01"

Expand Down
14 changes: 3 additions & 11 deletions simple/stats/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,16 +23,10 @@

FLAGS = flags.FLAGS

flags.DEFINE_string(
"entity_type",
None,
"The type of entities in the CSV (e.g. 'City', 'Country', 'Company', etc.).",
)
flags.DEFINE_string("input_path", constants.DEFAULT_INPUT_PATH,
"The input directory or file.")
flags.DEFINE_string("input_dir", constants.DEFAULT_INPUT_DIR,
"The input directory.")
flags.DEFINE_string("output_dir", constants.DEFAULT_OUTPUT_DIR,
"The output directory.")
flags.DEFINE_list("ignore_columns", [], "List of input columns to be ignored.")
flags.DEFINE_bool(
"freeze_time",
False,
Expand Down Expand Up @@ -60,10 +54,8 @@ def _init_logger():

def _run():
Runner(
input_path=FLAGS.input_path,
input_dir=FLAGS.input_dir,
output_dir=FLAGS.output_dir,
entity_type=FLAGS.entity_type,
ignore_columns=FLAGS.ignore_columns,
).run()


Expand Down
56 changes: 21 additions & 35 deletions simple/stats/runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,14 +35,13 @@ class Runner:
"""Runs and coordinates all imports.
"""

def __init__(
self,
input_path: str,
output_dir: str,
entity_type: str = None,
ignore_columns: list[str] = list(),
) -> None:
self.input_fh = create_file_handler(input_path)
def __init__(self, input_dir: str, output_dir: str) -> None:
self.input_dir_fh = create_file_handler(input_dir)
if not self.input_dir_fh.isdir:
raise NotADirectoryError(
f"Input path must be a directory: {input_dir}. If it is a GCS path, ensure it ends with a '/'."
)

self.output_dir_fh = create_file_handler(output_dir)
self.nl_dir_fh = self.output_dir_fh.make_file(f"{constants.NL_DIR_NAME}/")
self.process_dir_fh = self.output_dir_fh.make_file(
Expand All @@ -54,16 +53,11 @@ def __init__(

self.reporter = ImportReporter(report_fh=self.process_dir_fh.make_file(
constants.REPORT_JSON_FILE_NAME))
self.entity_type = entity_type
self.ignore_columns = ignore_columns

self.config = Config(data={})
if self.input_fh.isdir:
config_fh = self.input_fh.make_file(constants.CONFIG_JSON_FILE_NAME)
if not config_fh.exists():
raise FileNotFoundError(
"Config file must be provided for importing directories.")
self.config = Config(data=json.loads(config_fh.read_string()))
config_fh = self.input_dir_fh.make_file(constants.CONFIG_JSON_FILE_NAME)
if not config_fh.exists():
raise FileNotFoundError("Config file must be provided.")
self.config = Config(data=json.loads(config_fh.read_string()))

def _get_db_config() -> dict:
# Attempt to get from env (cloud sql, then sqlite),
Expand Down Expand Up @@ -116,24 +110,16 @@ def run(self):
self.reporter.report_failure(error=str(e))

def _run_imports(self):
if not self.input_fh.isdir:
self.reporter.report_started(import_files=[self.input_fh.basename()])
self._run_single_import(input_file_fh=self.input_fh,
reporter=self.reporter.import_file(
self.input_fh.basename()),
entity_type=self.entity_type,
ignore_columns=self.ignore_columns)
else:
input_files = sorted(self.input_fh.list_files(extension=".csv"))
self.reporter.report_started(import_files=input_files)
if not input_files:
raise RuntimeError("Not input CSVs found.")
for input_file in input_files:
self._run_single_import(
input_file_fh=self.input_fh.make_file(input_file),
reporter=self.reporter.import_file(input_file),
entity_type=self.config.entity_type(input_file),
ignore_columns=self.config.ignore_columns(input_file))
input_files = sorted(self.input_dir_fh.list_files(extension=".csv"))
self.reporter.report_started(import_files=input_files)
if not input_files:
raise RuntimeError("Not input CSVs found.")
for input_file in input_files:
self._run_single_import(
input_file_fh=self.input_dir_fh.make_file(input_file),
reporter=self.reporter.import_file(input_file),
entity_type=self.config.entity_type(input_file),
ignore_columns=self.config.ignore_columns(input_file))

def _run_single_import(self,
input_file_fh: FileHandler,
Expand Down

0 comments on commit 4e4831b

Please sign in to comment.