use hfs.ls instead of hl.hadoop_stat
bw2 committed Jun 30, 2024
1 parent d2f77bc commit 4574f4a
Showing 1 changed file with 9 additions and 8 deletions.
17 changes: 9 additions & 8 deletions step_pipeline/utils.py
@@ -4,15 +4,16 @@
 from datetime import datetime, timezone
 from dateutil import parser
 import glob
-import hail as hl
+#import hail as hl
+import hailtop.fs as hfs
 import os
 import pytz
 import subprocess
 import tempfile
 
 
-os.environ["HAIL_LOG_DIR"] = tempfile.gettempdir()
-hl.init(log="/dev/null", quiet=True, idempotent=True)
+#os.environ["HAIL_LOG_DIR"] = tempfile.gettempdir()
+#hl.init(log="/dev/null", quiet=True, idempotent=True)
 
 GOOGLE_STORAGE_CLIENT = None
 PATH_EXISTS_CACHE = {}
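
The practical effect of this hunk: file I/O now goes through hailtop.fs, so importing the module no longer spins up a Hail context (the hl.init call is commented out rather than deleted). A minimal sketch of hfs-only usage, assuming the hfs.exists/hfs.ls API and a placeholder bucket path:

    # Sketch only: hailtop.fs file I/O without hl.init().
    # The gs:// path is a placeholder, not a path from this repo.
    import hailtop.fs as hfs

    path = "gs://example-bucket/dir/file.bam.bai"
    if hfs.exists(path):
        for entry in hfs.ls(path):  # a single file yields one entry
            print(entry)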
@@ -53,7 +54,7 @@ def _get_google_storage_client(gcloud_project):
 def _generate_gs_path_to_file_stat_dict(gs_path_with_wildcards):
     """Takes a gs:// path that contains one or more wildcards ("*") and runs "gsutil ls -l {gs_path_with_wildcards}".
     This method then returns a dictionary that maps each gs:// file to its size in bytes. Running gsutil is currently
-    faster than running hl.hadoop_ls(..) when the path matches many files.
+    faster than running hfs.ls(..) when the path matches many files.
     """
     if not isinstance(gs_path_with_wildcards, str):
         raise ValueError(f"Unexpected argument type {str(type(gs_path_with_wildcards))}: {gs_path_with_wildcards}")
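
For reference, a sketch of the gsutil-based expansion this docstring describes; the helper name here is hypothetical, and it assumes the usual "size  timestamp  url" columns of gsutil ls -l output (the trailing TOTAL line fails the gs:// check and is skipped):

    # Sketch: expand a wildcard gs:// path with gsutil and map each
    # matched file to its size in bytes.
    import subprocess

    def gs_wildcard_to_size_dict(gs_path_with_wildcards):
        output = subprocess.check_output(
            ["gsutil", "ls", "-l", gs_path_with_wildcards], text=True)
        path_to_size = {}
        for line in output.splitlines():
            fields = line.split()
            if len(fields) >= 3 and fields[-1].startswith("gs://"):
                path_to_size[fields[-1]] = int(fields[0])
        return path_to_size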
@@ -129,7 +130,7 @@ def _path_exists__cached(path, only_check_the_cache=False, verbose=False):
                 for path_without_star in path_dict:
                     PATH_EXISTS_CACHE[path_without_star] = True
             else:
-                PATH_EXISTS_CACHE[path] = hl.hadoop_exists(path)
+                PATH_EXISTS_CACHE[path] = hfs.exists(path)
         else:
             if "*" in path:
                 path_dict = glob.glob(path)
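
The caching pattern in this hunk, sketched standalone (the function name is hypothetical; the real logic lives in _path_exists__cached and also handles the wildcard branch shown above):

    # Sketch: memoized existence checks dispatching on the path scheme.
    import glob
    import os
    import hailtop.fs as hfs

    PATH_EXISTS_CACHE = {}

    def path_exists_cached(path):
        if path not in PATH_EXISTS_CACHE:
            if path.startswith("gs://"):
                PATH_EXISTS_CACHE[path] = hfs.exists(path)
            elif "*" in path:
                PATH_EXISTS_CACHE[path] = bool(glob.glob(path))
            else:
                PATH_EXISTS_CACHE[path] = os.path.exists(path)
        return PATH_EXISTS_CACHE[path]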
@@ -188,15 +189,15 @@ def _file_stat__cached(path, only_check_the_cache=False, verbose=False):
                     PATH_EXISTS_CACHE[path_without_star] = True
             else:
                 try:
-                    stat_results = hl.hadoop_stat(path)
+                    stat_results = hfs.ls(path)
                 except Exception as e:
                     if "File not found" in str(e):
                         raise FileNotFoundError(f"File not found: {path}")
                     else:
                         raise e
 
 
-                """hl.hadoop_stat returns:
+                """hfs.ls returns:
                 {
                     'path': 'gs://bucket/dir/file.bam.bai',
                     'size_bytes': 2784,
@@ -219,7 +220,7 @@ def _file_stat__cached(path, only_check_the_cache=False, verbose=False):
             except Exception as e:
                 raise Exception(f"Unable to parse 'modification_time' from {stat_results}: {e}")
         elif stat_results["modification_time"] == None:
-            raise GoogleStorageException(f"hl.stat returned modification_time == None for {path}")
+            raise GoogleStorageException(f"hfs.ls returned modification_time == None for {path}")
         else:
             raise GoogleStorageException(f"Unexpected modification_time type: {type(stat_results['modification_time'])} in {stat_results}")
 
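Putting the pieces together, a sketch of the stat-and-parse flow under the same assumptions the diff makes: hfs.ls yields dict-like entries with a 'modification_time' string (per the updated docstring), and GoogleStorageException stands in for the repo's own exception class:

    # Sketch: fetch stat results via hfs.ls and parse the timestamp
    # with dateutil, mirroring the branches shown in the diff above.
    from dateutil import parser
    import hailtop.fs as hfs

    class GoogleStorageException(Exception):
        pass

    def file_modification_time(path):
        entries = hfs.ls(path)  # may raise if the path does not exist
        stat_results = entries[0] if isinstance(entries, list) else entries
        mtime = stat_results["modification_time"]
        if isinstance(mtime, str):
            return parser.parse(mtime)
        raise GoogleStorageException(
            f"hfs.ls returned modification_time == {mtime!r} for {path}")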