Skip to content

Commit

Permalink
min_size_per_dim + changes for scalars inputs+outputs support (#354)
Browse files Browse the repository at this point in the history
* multiprocessing fix and helper func

* path utilities and also moshikos change to solve hang on tests

* PR comments implemented

* path utils

* changed default deepdiff behavior to ignore nans in comparison, added keys() items() and values() to our NDict, and tried to highlight more the faulting op in pipeline ops error

* solved static code analysis raised issues

* removed unreachable code in paths.py

* * Added "remove_extension" to path utils
* Changed default deepdiff behavior to ignore nans in comparison,
* Added keys() items() and values() to our NDict (until now it returned empty iterables for those which is incorrect)
* Tried to highlight more the faulting op in pipeline ops error

* fixed a bug in head_1D_classifier

* added a lightweight mode of DatasetDefault that doesn't hold any sample_ids. fixed a typo in samplers.py and added a describe method to NDict

* fixing statically detected issues

* added simple function caching utility

* lite weight dataset default

* fixed static checkers

* fixed static code analysis related stuff

* code cleanup

* removed comments

* implemented PR comments

* added hints and better error messages for common mistakes when providing pipeline ops list

* linters etc.

* activation checkpointing

* ...

* activation checkpointing

* removed unneeded files

* added ability to ignore kwargs in function string descriptor building logic, and solved an issue in run_multiprocessed in verbose=0 case

* ...

* added support for maxtasksperchild in run_multiprocessed

* static code analysis based fixes

* ...

* multiprocessing related improvements

* added a utility to get available cpu cores num

* added simple but useful helper function to add prefix to file basename

* static code fixes

* fixed info message on num_available_cores

* num_available_cores works correctly now in distributed LSF/CCC setting as well now

* static code checkers fixes

* ...

* added a common op for replacing entities

* static code checkers

* ...

* removed too user specific dir paths from hash input string

* address tuple in collate

* shared memory utility helping to speed up processing significantly in cases that enough RAM is available

* typo fix

* added a check for available total memory

* ...

* ...

* ...

* ...

* ...

* storing with full path

* storing with full path

* storing with full path

* collate now supports minimal dimension + added more support in samples cacher

* collate now supports minimal dimension + added more support in samples cacher

* PR comments

* ...

* added unit tests :)

* unit tests

* unit tests

* ...

* interface validator

* PR comments

* PR comments

* fixing tests

* fixing tests

* min_size_per_dim

* allowing passing min_size_per_dim which is longer (end will be ignored)

* fixed exception logic

* allowing None to be ignored in casting ops

* added support in pearson correlation for samples of different sizes

* unittest shows that pearson correlation now fails when the input is a list of raw floats; solved it

* PR comments

* support cases where a loss function returns 0.0 in loss accumulation, and make the pearson correlation calculation support empty inputs

---------

Co-authored-by: Yoel Shoshan <[email protected]>
Co-authored-by: Moshiko Raboh <[email protected]>
Co-authored-by: Michal Ozery-Flato <[email protected]>
Co-authored-by: [email protected] <[email protected]>
  • Loading branch information
5 people authored Jul 26, 2024
1 parent d690708 commit 27a89a7
Show file tree
Hide file tree
Showing 4 changed files with 32 additions and 7 deletions.
6 changes: 5 additions & 1 deletion fuse/data/ops/ops_cast.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,12 +37,15 @@ def to_tensor(
value: Any,
dtype: Optional[torch.dtype] = None,
device: Optional[torch.device] = None,
ignore_none: bool = False,
) -> Tensor:
"""
Convert many types to tensor
"""
if isinstance(value, torch.Tensor) and dtype is None and device is None:
pass # do nothing
elif ignore_none and (value is None):
pass # do nothing
elif isinstance(value, (torch.Tensor)):
value = value.to(dtype=dtype, device=device)
elif isinstance(value, (np.ndarray, int, float, list)):
Expand Down Expand Up @@ -213,8 +216,9 @@ def _cast(
value: Any,
dtype: Optional[torch.dtype] = None,
device: Optional[torch.device] = None,
ignore_none: bool = False,
) -> Tensor:
return Cast.to_tensor(value, dtype, device)
return Cast.to_tensor(value, dtype, device, ignore_none=ignore_none)


class OpToNumpy(OpCast):
Expand Down
5 changes: 4 additions & 1 deletion fuse/data/utils/collates.py
Original file line number Diff line number Diff line change
Expand Up @@ -192,7 +192,10 @@ def pad_all_tensors_to_same_size(

if min_size_per_dim is not None:
assert isinstance(min_size_per_dim, tuple)
assert len(min_size_per_dim) == len(max_per_dim)
if len(min_size_per_dim) < len(max_per_dim):
raise Exception(
f" length for min_size_per_dim={min_size_per_dim} expected to be >= max_per_dim={max_per_dim} but found length {len(min_size_per_dim)} and {len(max_per_dim)} respectively ! "
)
assert all(
[(x > 0) or (x == -1) for x in min_size_per_dim]
), "allowed values for elements in min_size_per_dim are only positive integer or -1"
Expand Down
10 changes: 8 additions & 2 deletions fuse/dl/lightning/pl_funcs.py
Original file line number Diff line number Diff line change
Expand Up @@ -179,15 +179,21 @@ def step_losses(
for loss_name, loss_function in losses.items():
current_loss_result = loss_function(batch_dict)
if not optimize:
batch_dict["losses." + loss_name] = current_loss_result.data.item()
batch_dict["losses." + loss_name] = (
current_loss_result.data.item()
if torch.is_tensor(current_loss_result)
else current_loss_result
)
# sum all losses for backward
if total_loss is None:
total_loss = current_loss_result
else:
total_loss += current_loss_result

if total_loss is not None and not optimize:
batch_dict["losses.total_loss"] = total_loss.data.item()
batch_dict["losses.total_loss"] = (
total_loss.data.item() if torch.is_tensor(total_loss) else total_loss
)

return total_loss

Expand Down
18 changes: 15 additions & 3 deletions fuse/eval/metrics/libs/stat.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,12 +20,24 @@ def pearson_correlation(
:param target: target values
:param mask: optional boolean mask. if it is provided, the metric will be applied only to the masked samples
"""
if 0 == len(pred):
return dict(statistic=float("nan"), p_value=float("nan"))

if isinstance(pred, Sequence):
pred = np.array(pred)
if np.isscalar(pred[0]):
pred = np.array(pred)
else:
pred = np.concatenate(pred)
if isinstance(target, Sequence):
target = np.array(target)
if np.isscalar(target[0]):
target = np.array(target)
else:
target = np.concatenate(target)
if isinstance(mask, Sequence):
mask = np.array(mask).astype("bool")
if np.isscalar(mask[0]):
mask = np.array(mask).astype("bool")
else:
mask = np.concatenate(mask).astype("bool")
if mask is not None:
pred = pred[mask]
target = target[mask]
Expand Down

0 comments on commit 27a89a7

Please sign in to comment.