Introduce dataframe to ExperimentData (step1) #1133

Merged

Changes from all commits (27 commits)
6d53507
Add dataframe support for extended equality
nkanazawa1989 Apr 10, 2023
a24d7b4
Replace ExperimentData._analysis_results with dataframe
nkanazawa1989 Apr 10, 2023
65269e9
Upgrade database_service unittests
nkanazawa1989 Apr 10, 2023
323a0d9
Docs update
nkanazawa1989 May 12, 2023
7b5f901
Replace verbosity with explicit column names
nkanazawa1989 Jun 13, 2023
be515f7
Add test for dataframe classes
nkanazawa1989 Jun 14, 2023
4675324
Add code comment for short uuid
nkanazawa1989 Jun 21, 2023
20760f7
Add job running time to ExperimentData
nkanazawa1989 Jun 21, 2023
b84dfa1
Extend AnalysisResultData to be comparable with AnalysisResultTable c…
nkanazawa1989 Jun 21, 2023
f549642
Add type alias for figure data
nkanazawa1989 Jun 21, 2023
3c154d4
Overhaul composite analysis. Simplified sub container initialization.
nkanazawa1989 Jun 21, 2023
c9de11c
Revert "Overhaul composite analysis. Simplified sub container initial…
nkanazawa1989 Jul 2, 2023
a82ef79
Minimum cleanup for composite analysis
nkanazawa1989 Jul 2, 2023
4fc1b6d
Bugfix
nkanazawa1989 Jul 3, 2023
df5557b
Reorganize: move result table class to framework
nkanazawa1989 Jul 3, 2023
cddcaab
Add reno
nkanazawa1989 Jul 3, 2023
b2579d8
fix test
nkanazawa1989 Jul 3, 2023
adfd41e
more threadsafe
nkanazawa1989 Jul 3, 2023
b288d24
Drop multi processing test. Qiskit parallel_map is multiprocess based…
nkanazawa1989 Jul 3, 2023
438d828
Drop __getattr__ from the ThreadSafeDataFrame. This was implemented t…
nkanazawa1989 Jul 3, 2023
c2cd090
Fix analysis save bug
nkanazawa1989 Aug 7, 2023
cc5cf41
Remove redundant timezone conversion and use tzlocal
nkanazawa1989 Aug 7, 2023
b00945b
Fix missing experiment and run_time column in loaded data. These are …
nkanazawa1989 Aug 7, 2023
2cda8dc
Add test for key order preservation and remove redundant code
nkanazawa1989 Aug 7, 2023
665ff8f
Code fix for auto figure name. Innermost experiment creates figure na…
nkanazawa1989 Aug 7, 2023
2bdd09d
Relax the validation for result id. User should be able to use arbitr…
nkanazawa1989 Aug 8, 2023
f6e7294
Update running_time doc
nkanazawa1989 Aug 11, 2023
7 changes: 4 additions & 3 deletions qiskit_experiments/database_service/device_component.py
@@ -81,9 +81,10 @@ def to_component(string: str) -> DeviceComponent:
     Raises:
         ValueError: If input string is not a valid device component.
     """
+    if isinstance(string, DeviceComponent):
+        return string
     if string.startswith("Q"):
         return Qubit(int(string[1:]))
-    elif string.startswith("R"):
+    if string.startswith("R"):
         return Resonator(int(string[1:]))
-    else:
-        return UnknownComponent(string)
+    return UnknownComponent(string)
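For illustration only (not part of the diff), a short usage sketch of the updated function. It assumes the usual "Q<n>"/"R<n>" string forms of these components; "readout7" is an arbitrary example string.

# Illustration of to_component after this change (not part of the PR diff).
from qiskit_experiments.database_service.device_component import Qubit, to_component

assert str(to_component("Q0")) == "Q0"      # parsed into a Qubit
assert str(to_component("R1")) == "R1"      # parsed into a Resonator
qubit = Qubit(2)
assert to_component(qubit) is qubit         # new: DeviceComponent inputs pass through unchanged
print(to_component("readout7"))             # anything else becomes an UnknownComponent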
200 changes: 199 additions & 1 deletion qiskit_experiments/database_service/utils.py
@@ -19,12 +19,14 @@
from abc import ABC, abstractmethod
from collections import OrderedDict
from datetime import datetime, timezone
from typing import Callable, Tuple, Dict, Any, Union, Type, Optional
from typing import Callable, Tuple, List, Dict, Any, Union, Type, Optional
import json

import pandas as pd
import dateutil.parser
import pkg_resources
from dateutil import tz

from qiskit.version import __version__ as terra_version

from qiskit_ibm_experiment import (
@@ -276,3 +278,199 @@ def append(self, value):
"""Append to the list."""
with self._lock:
self._container.append(value)


class ThreadSafeDataFrame(ThreadSafeContainer):
"""Thread safe data frame.

This class wraps pandas dataframe with predefined column labels,
which is specified by the class method `_default_columns`.
Subclass can override this method to provide default labels specific to its data structure.

This object is expected to be used internally in the ExperimentData.
"""

def __init__(self, init_values=None):
"""ThreadSafeContainer constructor."""
self._columns = self._default_columns()
self._extra = []
super().__init__(init_values)

@classmethod
def _default_columns(cls) -> List[str]:
return []

def _init_container(self, init_values: Optional[Union[Dict, pd.DataFrame]] = None):
"""Initialize the container."""
if init_values is None:
return pd.DataFrame(columns=self.get_columns())
if isinstance(init_values, pd.DataFrame):
input_columns = list(init_values.columns)
if input_columns != self.get_columns():
raise ValueError(
f"Input data frame contains unexpected columns {input_columns}. "
f"{self.__class__.__name__} defines {self.get_columns()} as default columns."
)
return init_values
if isinstance(init_values, dict):
return pd.DataFrame.from_dict(
data=init_values,
orient="index",
columns=self.get_columns(),
)
raise TypeError(f"Initial value of {type(init_values)} is not valid data type.")

def get_columns(self) -> List[str]:
"""Return current column names.

Returns:
List of column names.
"""
with self._lock:
return self._columns.copy()

def add_columns(self, *new_columns: str, default_value: Any = None):
"""Add new columns to the table.

This operation mutates the current container.

Args:
new_columns: Name of columns to add.
default_value: Default value to fill added columns.
"""
with self._lock:
# Order sensitive
new_columns = [c for c in new_columns if c not in self.get_columns()]

Review comment:

If we convert self.get_columns() from a list to a set here, this check will be faster. Using a set would not be a problem because column names are unique by construction.

nkanazawa1989 (Collaborator, Author) replied on Aug 7, 2023:

I agree a set is faster, but the column names are order sensitive -- this is not only about uniqueness. The table is designed so that the most important information appears on the left side of the pandas HTML table (and I also believe a user assumes the keys are added to the table in the same order as the add method calls). Unfortunately, Python doesn't provide an ordered set builtin, and I don't want to add an extra dependency without a drastic performance gain, so I gave up on using a set here.

Here is the result of a very casual performance check on my laptop (typically the column count is at most 20, and a heavily customized analysis might add ~3 new keys): [benchmark screenshot]

Indeed the set operation is faster, but the difference is less than 1 us. Since this line runs only when an analysis class generates an extra key, even a batch of 10 experiments (a really rare case) cannot call this method more than 10 times. So the difference a user might experience in a heavy setting would be a few microseconds.

nkanazawa1989 (Collaborator, Author) replied:

I added a new test case for key order preservation in 2cda8dc.

if len(new_columns) == 0:
return

# Update columns
for new_column in new_columns:
self._container.insert(len(self._container.columns), new_column, default_value)
self._columns.extend(new_columns)
self._extra.extend(new_columns)
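For reference, a minimal standalone sketch of the trade-off discussed in the review thread above: the list comprehension used in add_columns preserves insertion order, while a set difference does not. The column names below are made up for the example.

# Standalone illustration (not part of the PR): order-preserving vs. set-based de-duplication.
existing = ["name", "value", "quality"]    # current column order matters for display
incoming = ("chisq", "value", "unit")      # candidate columns passed to add_columns

# The approach used above keeps the left-to-right order of the new keys.
ordered_new = [c for c in incoming if c not in existing]
assert ordered_new == ["chisq", "unit"]

# A set difference is asymptotically faster but loses the ordering guarantee.
unordered_new = set(incoming) - set(existing)
assert unordered_new == {"chisq", "unit"}  # same members, arbitrary iteration order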

def clear(self):
"""Remove all elements from this container."""
with self._lock:
self._container = self._init_container()
self._columns = self._default_columns()
self._extra = []

def container(
self,
collapse_extra: bool = True,
) -> pd.DataFrame:
"""Return bare pandas dataframe.

Args:
collapse_extra: Set True to show only default columns.

Returns:
Bare pandas dataframe. This object is no longer thread safe.
"""
with self._lock:
container = self._container.copy()

if collapse_extra:
return container[self._default_columns()]
return container

def drop_entry(
self,
index: str,
):
"""Drop entry from the dataframe.

Args:
index: Name of entry to drop.

Raises:
ValueError: When index is not in this table.
"""
with self._lock:
if index not in self._container.index:

Review comment:

As mentioned in my first review, you can use a set here too, because indexes must be unique. You could keep a set of unique indexes on the class; in my opinion it would not take much space.

nkanazawa1989 (Collaborator, Author) replied on Aug 7, 2023:

Indeed this is a pandas Index object and this check is sufficiently performant; the typecast overhead is much more expensive. [benchmark screenshot]

Note that indices are not only added but also deleted. Adding a new instance member to track the existing indices might make the code slightly faster, but it would add lines of code in multiple places and increase the maintenance overhead.

raise ValueError(f"Table index {index} doesn't exist in this table.")
self._container.drop(index, inplace=True)
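A minimal timing sketch (assumed setup, not from the PR) of the comparison in the thread above: membership testing directly against the pandas Index versus rebuilding a builtin set on every call. The index names and sizes are arbitrary.

# Standalone timing sketch (not part of the PR).
import timeit

import pandas as pd

df = pd.DataFrame(index=[f"result-{i}" for i in range(20)], columns=["name", "value"])

# Membership check against the pandas Index, as done in drop_entry/get_entry/add_entry.
t_index = timeit.timeit(lambda: "result-10" in df.index, number=100_000)

# Casting the Index to a set on every call; the cast dominates the cost.
t_set = timeit.timeit(lambda: "result-10" in set(df.index), number=100_000)

print(f"Index lookup: {t_index:.3f}s, set() per call: {t_set:.3f}s")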

def get_entry(
self,
index: str,
) -> pd.Series:
"""Get entry from the dataframe.

Args:
index: Name of entry to acquire.

Returns:
Pandas Series of acquired entry. This doesn't mutate the table.

Raises:
ValueError: When index is not in this table.
"""
with self._lock:
if index not in self._container.index:
raise ValueError(f"Table index {index} doesn't exist in this table.")

return self._container.loc[index]

def add_entry(
self,
index: str,
**kwargs,
) -> pd.Series:
"""Add new entry to the dataframe.

Args:
index: Name of this entry. Must be unique in this table.
kwargs: Description of new entry to register.

Returns:
Pandas Series of added entry. This doesn't mutate the table.

Raises:
ValueError: When index is not unique in this table.
"""
with self._lock:
if index in self._container.index:
raise ValueError(f"Table index {index} already exists in the table.")

if kwargs.keys() - set(self.get_columns()):
self.add_columns(*kwargs.keys())

template = dict.fromkeys(self.get_columns())
template.update(kwargs)

if not isinstance(index, str):
index = str(index)
self._container.loc[index] = list(template.values())

return self._container.iloc[-1]

def _repr_html_(self) -> Union[str, None]:
"""Return HTML representation of this dataframe."""
with self._lock:
# Remove underscored columns.
return self._container._repr_html_()

def __json_encode__(self) -> Dict[str, Any]:
with self._lock:
return {
"class": "ThreadSafeDataFrame",
"data": self._container.to_dict(orient="index"),
"columns": self._columns,
"extra": self._extra,
}

@classmethod
def __json_decode__(cls, value: Dict[str, Any]) -> "ThreadSafeDataFrame":
if not value.get("class", None) == "ThreadSafeDataFrame":
raise ValueError("JSON decoded value for ThreadSafeDataFrame is not valid class type.")

instance = object.__new__(cls)
# Need to update self._columns first to set extra columns in the dataframe container.
instance._columns = value.get("columns", cls._default_columns())
instance._extra = value.get("extra", [])
instance._lock = threading.RLock()
instance._container = instance._init_container(init_values=value.get("data", {}))
return instance
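As a usage illustration only (the subclass name and values below are hypothetical), this is the pattern the class docstring describes: a subclass defines its default columns, and extra keyword arguments passed to add_entry create extra columns on the fly. The import path follows the file touched in this diff.

# Hypothetical subclass and usage of ThreadSafeDataFrame (illustration only).
from typing import List

from qiskit_experiments.database_service.utils import ThreadSafeDataFrame


class MyResultTable(ThreadSafeDataFrame):
    @classmethod
    def _default_columns(cls) -> List[str]:
        # Most important columns first: they render leftmost in the HTML table.
        return ["name", "value", "quality"]


table = MyResultTable()
table.add_entry(index="result-1", name="T1", value=123e-6, quality="good")
table.add_entry(index="result-2", name="T2", value=80e-6, unit="s")  # adds an extra "unit" column

print(table.get_entry("result-1")["value"])        # 0.000123
print(list(table.container(collapse_extra=True)))  # default columns only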
2 changes: 2 additions & 0 deletions qiskit_experiments/framework/__init__.py
@@ -86,6 +86,7 @@
AnalysisStatus
AnalysisResult
AnalysisResultData
AnalysisResultTable
ExperimentConfig
AnalysisConfig
ExperimentEncoder
@@ -137,6 +138,7 @@
from .backend_timing import BackendTiming
from .configs import ExperimentConfig, AnalysisConfig
from .analysis_result_data import AnalysisResultData
from .analysis_result_table import AnalysisResultTable
from .experiment_data import ExperimentData
from .composite import (
ParallelExperiment,
93 changes: 92 additions & 1 deletion qiskit_experiments/framework/analysis_result_data.py
@@ -16,21 +16,80 @@
import logging
from typing import Optional, Dict, Any, List

from qiskit_experiments.database_service.device_component import DeviceComponent


LOG = logging.getLogger(__name__)


@dataclasses.dataclass
class AnalysisResultData:
"""Dataclass for experiment analysis results"""

# TODO: move stderr and unit into custom value class
name: str
value: Any
experiment: str = None
chisq: Optional[float] = None
quality: Optional[str] = None
experiment_id: Optional[str] = None
result_id: Optional[str] = None
tags: List = dataclasses.field(default_factory=list)
backend: Optional[str] = None
run_time: Optional[str] = None
created_time: Optional[str] = None
extra: Dict[str, Any] = dataclasses.field(default_factory=dict, hash=False, compare=False)
device_components: List = dataclasses.field(default_factory=list)

@classmethod
def from_table_element(
cls,
name: str,
value: Any,
experiment: Optional[str] = None,
components: Optional[List[DeviceComponent]] = None,
quality: Optional[str] = None,
experiment_id: Optional[str] = None,
result_id: Optional[str] = None,
tags: Optional[List[str]] = None,
backend: Optional[str] = None,
run_time: Optional[str] = None,
created_time: Optional[str] = None,
**extra,
):
"""A factory method of AnalysisResultData from a single element in AnalysisResultTable.

Args:
name: Name of this entity.
value: Result value.
experiment: Type of experiment.
components: Device component that the experiment was run on.
quality: Quality of this result.
experiment_id: ID of associated experiment.
result_id: Unique ID of this data entry in the storage.
tags: List of tags.
backend: Device name that the experiment was run on.
run_time: A time at the experiment was run.
created_time: A time at this value was computed.
**extra: Extra information.
"""
chisq = extra.pop("chisq", None)

return AnalysisResultData(
name=name,
value=value,
experiment=experiment,
chisq=chisq,
quality=quality,
experiment_id=experiment_id,
result_id=result_id,
tags=tags,
backend=backend,
run_time=run_time,
created_time=created_time,
device_components=components,
extra=extra,
)

def __str__(self):
out = f"{self.name}:"
out += f"\n- value:{self.value}"
@@ -47,3 +106,35 @@ def __str__(self):
def __iter__(self):
"""Return iterator of data fields (attr, value)"""
return iter((field.name, getattr(self, field.name)) for field in dataclasses.fields(self))


def as_table_element(
result_data: AnalysisResultData,
) -> Dict[str, Any]:
"""Python dataclass as_dict-like function to return
canonical data for analysis AnalysisResultTable.

Args:
result_data: AnalysisResultData dataclass to format.

Returns:
Formatted data representation in dictionary format.
"""
out = {
"name": result_data.name,
"experiment": result_data.experiment,
"components": result_data.device_components,
"value": result_data.value,
"quality": result_data.quality,
"experiment_id": result_data.experiment_id,
"result_id": result_data.result_id,
"tags": result_data.tags,
"backend": result_data.backend,
"run_time": result_data.run_time,
"created_time": result_data.created_time,
}
if result_data.chisq is not None:
out["chisq"] = result_data.chisq
out.update(result_data.extra)

return out
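For illustration (not part of the diff), a sketch of the round trip these two helpers enable: an AnalysisResultData instance is flattened into a canonical row dict with as_table_element and rebuilt with AnalysisResultData.from_table_element. The result values are made up; the import path follows the module touched in this diff.

# Round-trip sketch (illustration only): dataclass -> table row dict -> dataclass.
from qiskit_experiments.framework.analysis_result_data import (
    AnalysisResultData,
    as_table_element,
)

result = AnalysisResultData(
    name="T1",
    value=123e-6,
    chisq=1.2,
    quality="good",
    extra={"unit": "s"},  # stored as an extra column in the table
)

row = as_table_element(result)            # dict keyed by canonical column names
restored = AnalysisResultData.from_table_element(**row)

assert restored.chisq == 1.2              # "chisq" is popped back out of the extras
assert restored.extra == {"unit": "s"}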