dottxt-ai · cpfiffer · Oct 23, 2024 · Oct 24, 2024 · Oct 24, 2024 · Oct 24, 2024
diff --git a/.gitignore b/.gitignore
@@ -1,7 +1,3 @@
+*/.pyron/*
 */__pycache__/
-lore-generator/milvusdemo.db
-lore-generator/.env
-lore-generator/slides_files/
-*.db
-*.html
-*.lock
+*.env
diff --git a/earnings-transcripts/README.md b/earnings-transcripts/README.md
@@ -0,0 +1,175 @@
+# Earnings call analysis
+
+This project uses outlines to extract structured data from earnings call transcripts. You specify the information you wish to extract by defining a Pydantic model.
+
+## Overview
+
+Many public companies hold earnings calls where they discuss their financial performance and outlook. These calls are transcribed and made available online. Earnings calls are a valuable source of information about a company's financial health and prospects, but are unfortunately not structured data. It can require manual effort to extract the information of interest.
+
+This project uses outlines to extract structured data from earnings call transcripts. All we need to do is specify the information we wish to extract in a Pydantic model, and the language model will extract the data for us.
+
+Data extracted can be hard numbers, such as revenue or net income, or it can be qualitative, such as earnings sentiment. We can even have the model reason about macroeconomic risks the company may face in future quarters.
+
+WARNING! Do not use this as financial advice. This is a proof of concept, and the
+output should be verified by a human. Analyzing financial data is extremely difficult
+and requires a thorough understanding of the company, the industry, and the
+overall economy. Please consult a financial professional before making investment
+decisions.
+
+## Usage
+
+This can be run locally (using `transcripts_local.py`) or on [Modal](https://modal.com/) using `transcripts_modal.py`. Modal is a cloud platform supporting GPUs.
+
+### Local
+
+1. Install the requirements:
+    ```bash
+    pip install -r requirements.txt
+    ```
+2. Run the script:
+    ```bash
+    python transcripts_local.py
+    ```
+
+### Modal
+
+1. [Sign up](https://modal.com/signup) for a Modal account.
+2. Install the Modal CLI:
+    ```bash
+    pip install modal
+    ```
+3. Set up your Modal key if you have not done so:
+    ```bash
+    modal setup
+    ```
+4. Run the script:
+    ```bash
+    modal run transcripts_modal.py
+    ```
+
+1. Download the [data source](https://www.kaggle.com/datasets/tpotterer/motley-fool-scraped-earnings-call-transcripts)
+2. Run `unpickle-earnings.py` to extract the transcripts to the `transcripts/` directory. This step adds metadata to each transcript, such as company name, ticker, date, quarter, and the transcript itself.
+
+## How it works
+
+We define a Pydantic model that specifies the information we wish to extract from the transcripts. This model is passed to the language model, which then extracts the data according to the schema.
+
+The schema is defined in `transcripts_common.py`. The default class is `EarningsCall`, which extracts:
+
+- Company name and ticker
+- Earnings call date and quarter
+- Key takeaways from the call, a list of natural-language highlights
+- An understanding of the financial metrics mentioned in the call
+- Extracted financial metrics in a `FinancialMetrics` object
+- Earnings sentiment, whether the call conveyed generally positive, neutral, or negative information about the company
+- A detailed analysis of various risks the company faces
+    - Macroeconomic risks
+    - Financial risks
+    - Operational risks
+    - Strategic risks
+- An investment recommendation, whether to buy, hold, or sell the company
+- Review correctness (`review_correctness`), a self-critique by the language model
+  of the extracted data. The model reviews its output and lists any issues it finds.
+  This is useful to identify if the model made up numbers or misinterpreted the data.
+- Whether the text must be reprocessed (`text_must_be_reprocessed`), a true/false flag.
+
+```python
+class Sentiment(str, Enum):
+    """
+    Sentiment of the earnings call.
+    """
+    POSITIVE = "positive"
+    NEUTRAL = "neutral"
+    NEGATIVE = "negative"
+
+class InvestmentRecommendation(str, Enum):
+    """
+    Recommendation of whether to buy, hold, or sell the company's stock.
+    """
+    BUY = "buy"
+    HOLD = "hold"
+    SELL = "sell"
+
+class FinancialMetrics(BaseModel):
+    """
+    Financial metrics mentioned in the earnings call. This can be
+    extended to include other financial metrics as needed -- just
+    add them to the schema.
+
+    We use Optional[thing] for all financial metrics because not all
+    earnings calls mention all of these metrics, and forcing the model
+    to include them when they do not exist will force the model to
+    make numbers up.
+
+    It's useful to also specify units in the schema, otherwise the model
+    may use the units specified in the data. These can vary across companies.
+    """
+    revenue_in_millions: Optional[float] = Field(description="Quarterly revenue in millions of dollars")
+    revenue_growth_in_percent: Optional[float] = Field(description="Revenue growth by quarter in percent")
+    net_income_in_millions: Optional[float] = Field(description="Quarterly net income in millions of dollars")
+    earnings_per_share: Optional[float] = Field(description="Quarterly earnings per share in dollars")
+    ebitda_in_millions: Optional[float] = Field(description="Quarterly EBITDA in millions of dollars")
+    free_cash_flow_in_millions: Optional[float] = Field(description="Quarterly free cash flow in millions of dollars")
+
+class EarningsCall(BaseModel):
+    """
+    The main schema for the earnings call analysis. Using outlines to generate
+    this schema will extract all the information we request from an earnings
+    call transcript.
+
+    To add any new information to the schema, just add a new field to this class
+    (or any child classes, like FinancialMetrics).
+    """
+    company_name: str
+    company_ticker: str
+    earnings_call_date: str
+    earnings_call_quarter: str
+    key_takeaways: List[str]
+
+    # Financial metrics
+    understanding_of_financial_metrics: str
+    financial_metrics: FinancialMetrics
+
+    # Earnings sentiment
+    earnings_sentiment: Sentiment
+
+    # Analysis of various risks
+    macroeconomic_risk_reasoning: str
+    financial_risk_reasoning: str
+    operational_risk_reasoning: str
+    strategic_risk_reasoning: str
+
+    # Whether the analyst's prediction is a buy, hold, or sell
+    investment_recommendation: InvestmentRecommendation
+
+    # Have the model review its own output for correctness
+    review_correctness: List[str]
+
+    # Whether the text must be reprocessed
+    text_must_be_reprocessed: bool
+```
+
+Financial metrics extracted are
+
+- Revenue
+- Revenue growth
+- Net income
+- Earnings per share
+- EBITDA
+- Free cash flow
+
+and can easily be extended to include other financial metrics. In the event that you expand the schema to include other financial metrics, you will need to update the prompt to ensure that the model understands the new metrics it needs to extract.
+
+## Limitations
+
+- The model is not perfect. It will sometimes make up numbers or misinterpret the data.
+- The model cannot reliably extract information that is not mentioned in the transcripts. For example, if a company announces a new product but does not discuss its financial impact, the model will not be able to extract that impact.
+
+## Future work
+
+- Add a second agent to review the output of the first agent. The `text_must_be_reprocessed` field is currently not used, but could be used to trigger a second agent to attempt a reprocessing of the text.
+
+## Contributing
+
+We welcome contributions to the project! Please open an issue or submit a pull request.
+
diff --git a/earnings-transcripts/requirements.txt b/earnings-transcripts/requirements.txt
@@ -0,0 +1,98 @@
+accelerate==1.0.1
+aiohappyeyeballs==2.4.3
+aiohttp==3.10.10
+aiosignal==1.3.1
+aiostream==0.5.2
+airportsdata==20241001
+annotated-types==0.7.0
+anyio==4.6.2.post1
+async-timeout==4.0.3
+attrs==24.2.0
+certifi==2024.8.30
+charset-normalizer==3.4.0
+click==8.1.7
+cloudpickle==3.1.0
+datasets==3.0.2
+dill==0.3.8
+diskcache==5.6.3
+exceptiongroup==1.2.2
+fastapi==0.115.3
+filelock==3.16.1
+frozenlist==1.5.0
+fsspec==2024.9.0
+grpclib==0.4.7
+h2==4.1.0
+hpack==4.0.0
+huggingface-hub==0.26.1
+hyperframe==6.0.1
+idna==3.10
+interegular==0.3.3
+Jinja2==3.1.4
+jsonschema==4.23.0
+jsonschema-specifications==2024.10.1
+lark==1.2.2
+markdown-it-py==3.0.0
+MarkupSafe==3.0.2
+mdurl==0.1.2
+modal==0.64.224
+mpmath==1.3.0
+multidict==6.1.0
+multiprocess==0.70.16
+nest-asyncio==1.6.0
+networkx==3.4.2
+numpy==1.26.4
+nvidia-cublas-cu12==12.1.3.1
+nvidia-cuda-cupti-cu12==12.1.105
+nvidia-cuda-nvrtc-cu12==12.1.105
+nvidia-cuda-runtime-cu12==12.1.105
+nvidia-cudnn-cu12==9.1.0.70
+nvidia-cufft-cu12==11.0.2.54
+nvidia-curand-cu12==10.3.2.106
+nvidia-cusolver-cu12==11.4.5.107
+nvidia-cusparse-cu12==12.1.0.106
+nvidia-nccl-cu12==2.20.5
+nvidia-nvjitlink-cu12==12.6.77
+nvidia-nvtx-cu12==12.1.105
+outlines==0.1.1
+outlines_core==0.1.14
+packaging==24.1
+pandas==2.2.3
+propcache==0.2.0
+protobuf==4.25.5
+psutil==6.1.0
+pyarrow==17.0.0
+pycountry==24.6.1
+pydantic==2.9.2
+pydantic_core==2.23.4
+Pygments==2.18.0
+python-dateutil==2.9.0.post0
+pytz==2024.2
+PyYAML==6.0.2
+referencing==0.35.1
+regex==2024.9.11
+requests==2.32.3
+rich==13.9.3
+rpds-py==0.20.0
+safetensors==0.4.5
+shellingham==1.5.4
+sigtools==4.0.1
+six==1.16.0
+sniffio==1.3.1
+starlette==0.41.0
+sympy==1.13.3
+synchronicity==0.8.3
+tokenizers==0.20.1
+toml==0.10.2
+torch==2.4.0
+tqdm==4.66.5
+transformers==4.46.0
+triton==3.0.0
+typer==0.12.5
+types-certifi==2021.10.8.3
+types-toml==0.10.8.20240310
+typing_extensions==4.12.2
+tzdata==2024.2
+urllib3==2.2.3
+watchfiles==0.24.0
+xxhash==3.5.0
+yarl==1.16.0