From 59bb3da42454c6a98a26826afc0a31d128265343 Mon Sep 17 00:00:00 2001
From: John Strunk <jstrunk@redhat.com>
Date: Thu, 16 May 2024 21:13:44 +0000
Subject: [PATCH] Initial draft of token estimator script

Signed-off-by: John Strunk <jstrunk@redhat.com>
---
 bot.py               |   4 +-
 estimator.py         | 126 +++++++++++++++++++++++++++++++++++++++++++
 jira-summarizer.yaml |   2 +-
 jiraissues.py        |  28 +++++++++-
 summarize_issue.py   |  14 +++--
 summarizer.py        |  24 +++------
 6 files changed, 171 insertions(+), 27 deletions(-)
 create mode 100755 estimator.py

diff --git a/bot.py b/bot.py
index 529522b..e3665d4 100755
--- a/bot.py
+++ b/bot.py
@@ -77,9 +77,9 @@ def main():
                 print(f"Summarized {issue_key} ({elapsed}s):\n{summary}\n")
             since = start_time  # Only update if we succeeded
         except requests.exceptions.HTTPError as error:
-            logging.error("HTTPError exception: %s", error, stack_info=True)
+            logging.error("HTTPError exception: %s", error.response.reason)
         except requests.exceptions.ReadTimeout as error:
-            logging.error("ReadTimeout exception: %s", error, stack_info=True)
+            logging.error("ReadTimeout exception: %s", error, exc_info=True)
         logging.info(
             "Cache stats: %d hits, %d total", issue_cache.hits, issue_cache.tries
         )
diff --git a/estimator.py b/estimator.py
new file mode 100755
index 0000000..02dbda4
--- /dev/null
+++ b/estimator.py
@@ -0,0 +1,126 @@
+#! /usr/bin/env python
+
+"""Estimate the issue change rate and necessary token throughput"""
+
+import argparse
+import logging
+import os
+import time
+from dataclasses import dataclass
+from datetime import datetime, timedelta
+
+import requests
+from atlassian import Jira  # type: ignore
+
+from jiraissues import Issue, get_self, issue_cache
+
+
+@dataclass
+class IssueEstimate:
+    """Data class to hold the estimate information"""
+
+    key: str
+    issue_type: str
+    updated: datetime
+    child_count: int
+    comment_count: int
+    tokens: int
+
+    def __str__(self) -> str:
+        return f"{self.key} ({self.issue_type}): {self.tokens} tokens"
+
+    @classmethod
+    def csv_header(cls) -> str:
+        """Return the CSV header line"""
+        return "key,issue_type,updated,child_count,comment_count,tokens"
+
+    def as_csv(self) -> str:
+        """Return the CSV representation of the data"""
+        return ",".join(
+            [
+                self.key,
+                self.issue_type,
+                self.updated.isoformat(),
+                str(self.child_count),
+                str(self.comment_count),
+                str(self.tokens),
+            ]
+        )
+
+
+def estimate_issue(issue: Issue) -> IssueEstimate:
+    """Estimate the number of tokens needed to summarize the issue"""
+    return IssueEstimate(
+        key=issue.key,
+        issue_type=issue.issue_type,
+        updated=issue.updated,
+        child_count=len(issue.children),
+        comment_count=len(issue.comments),
+        tokens=0,  # Placeholder for now
+    )
+
+
+def get_modified_issues(client: Jira, since: datetime) -> list[Issue]:
+    """Get issues modified since the given date/time"""
+    user_zi = get_self(client).tzinfo
+    since_string = since.astimezone(user_zi).strftime("%Y-%m-%d %H:%M")
+
+    issues = client.jql(
+        f"updated >= '{since_string}' ORDER BY updated DESC",
+        limit=1000,
+        fields="key",
+    )
+    if not isinstance(issues, dict):
+        return []
+    issue_cache.clear()
+    return [issue_cache.get_issue(client, issue["key"]) for issue in issues["issues"]]
+
+
+def main() -> None:
+    """Main function"""
+    parser = argparse.ArgumentParser(description="Estimator")
+    # pylint: disable=duplicate-code
+    parser.add_argument(
+        "--log-level",
+        default="WARNING",
+        choices=["DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL"],
+        help="Set the logging level",
+    )
+    parser.add_argument(
+        "-s",
+        "--seconds",
+        type=int,
+        default=300,
+        help="Seconds to wait between iterations",
+    )
+
+    args = parser.parse_args()
+    logging.basicConfig(level=getattr(logging, args.log_level))
+    delay: int = args.seconds
+
+    jira = Jira(url=os.environ["JIRA_URL"], token=os.environ["JIRA_TOKEN"])
+
+    print(IssueEstimate.csv_header())
+    since = datetime.now() + timedelta(seconds=-delay)
+    while True:
+        start_time = datetime.now()
+        logging.info("Starting iteration at %s", start_time.isoformat())
+        try:
+            issues = get_modified_issues(jira, since)
+            for issue in issues:
+                print(estimate_issue(issue).as_csv())
+            since = start_time  # Only update if we succeeded
+        except requests.exceptions.HTTPError as error:
+            logging.error("HTTPError exception: %s", error.response.reason)
+        except requests.exceptions.ReadTimeout as error:
+            logging.error("ReadTimeout exception: %s", error, exc_info=True)
+        logging.info(
+            "Cache stats: %d hits, %d total", issue_cache.hits, issue_cache.tries
+        )
+        print(f"Iteration elapsed time: {datetime.now() - start_time}")
+        print(f"{'='*20} Sleeping for {delay} seconds {'='*20}")
+        time.sleep(delay)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/jira-summarizer.yaml b/jira-summarizer.yaml
index cf51ebf..1ac6ea5 100644
--- a/jira-summarizer.yaml
+++ b/jira-summarizer.yaml
@@ -24,7 +24,7 @@ spec:
             - "--log-level"
             - "INFO"
             - "--seconds"
-            - "300"
+            - "120"
           envFrom:
             - secretRef:
                 name: jira-summarizer-secret
diff --git a/jiraissues.py b/jiraissues.py
index eee7732..fea3c3b 100644
--- a/jiraissues.py
+++ b/jiraissues.py
@@ -18,7 +18,7 @@
 CF_STATUS_SUMMARY = "customfield_12320841"  # string
 
 # How long to delay between API calls
-CALL_DELAY_SECONDS: float = 0.2
+MIN_CALL_DELAY: float = 0.2
 
 
 @dataclass
@@ -382,6 +382,9 @@ def update_status_summary(self, contents: str) -> None:
         issue_cache.remove(self.key)  # Invalidate any cached copy
 
 
+_last_call_time = datetime.now()
+
+
 def _check(response: Any) -> dict:
     """
     Check the response from the Jira API and raise an exception if it's an
@@ -392,7 +395,15 @@ def _check(response: Any) -> dict:
     general, when things go well, you get back a dict. Otherwise, you could get
     anything.
     """
-    sleep(CALL_DELAY_SECONDS)
+    # Here, we throttle the API calls to avoid hitting the rate limit of the Jira server
+    global _last_call_time  # pylint: disable=global-statement
+    now = datetime.now()
+    delta = now - _last_call_time
+    required_delay = MIN_CALL_DELAY - delta.total_seconds()
+    if required_delay > 0:
+        sleep(required_delay)
+    _last_call_time = now
+
     if isinstance(response, dict):
         return response
     raise ValueError(f"Unexpected response: {response}")
@@ -413,6 +424,19 @@ def __init__(self, client: Jira) -> None:
         self.tzinfo = ZoneInfo(self.timezone)
 
 
+_self: Optional[Myself] = None
+
+
+def get_self(client: Jira) -> Myself:
+    """
+    Caching function for the Myself object.
+    """
+    global _self  # pylint: disable=global-statement
+    if _self is None:
+        _self = Myself(client)
+    return _self
+
+
 class IssueCache:
     """
     A cache of Jira issues to avoid fetching the same issue multiple times.
diff --git a/summarize_issue.py b/summarize_issue.py
index f3889f5..05cfc38 100755
--- a/summarize_issue.py
+++ b/summarize_issue.py
@@ -6,6 +6,7 @@
 import logging
 import os
 
+import requests
 from atlassian import Jira  # type: ignore
 
 from jiraissues import Issue
@@ -51,10 +52,15 @@ def main():
     jira = Jira(url=os.environ["JIRA_URL"], token=os.environ["JIRA_TOKEN"])
 
     issue = Issue(jira, args.jira_issue_key)
-    out = summarize_issue(
-        issue, regenerate=regenerate, max_depth=max_depth, send_updates=send_updates
-    )
-    print(out)
+    try:
+        out = summarize_issue(
+            issue, regenerate=regenerate, max_depth=max_depth, send_updates=send_updates
+        )
+        print(out)
+    except requests.exceptions.HTTPError as error:
+        logging.error("HTTPError exception: %s", error.response.reason)
+    except requests.exceptions.ReadTimeout as error:
+        logging.error("ReadTimeout exception: %s", error, exc_info=True)
 
 
 if __name__ == "__main__":
diff --git a/summarizer.py b/summarizer.py
index 5e8ca7d..b0424a9 100644
--- a/summarizer.py
+++ b/summarizer.py
@@ -5,7 +5,7 @@
 import os
 import textwrap
 from datetime import UTC, datetime
-from typing import List, Optional, Tuple, Union
+from typing import List, Tuple, Union
 
 from atlassian import Jira  # type: ignore
 from genai import Client, Credentials
@@ -19,7 +19,7 @@
 from langchain_core.language_models import LLM
 
 import text_wrapper
-from jiraissues import Issue, Myself, RelatedIssue, issue_cache
+from jiraissues import Issue, RelatedIssue, get_self, issue_cache
 
 _logger = logging.getLogger(__name__)
 
@@ -44,18 +44,6 @@
 
 _wrapper = text_wrapper.TextWrapper(SUMMARY_START_MARKER, SUMMARY_END_MARKER)
 
-_self: Optional[Myself] = None
-
-
-def self(client: Jira) -> Myself:
-    """
-    Caching function for the Myself object.
-    """
-    global _self  # pylint: disable=global-statement
-    if _self is None:
-        _self = Myself(client)
-    return _self
-
 
 # pylint: disable=too-many-locals
 def summarize_issue(
@@ -209,9 +197,9 @@ def summary_last_updated(issue: Issue) -> datetime:
         return last_update
 
     for change in issue.changelog:
-        if change.author == self(issue.client).display_name and "Status Summary" in [
-            chg.field for chg in change.changes
-        ]:
+        if change.author == get_self(
+            issue.client
+        ).display_name and "Status Summary" in [chg.field for chg in change.changes]:
             last_update = max(last_update, change.created)
 
     return last_update
@@ -338,7 +326,7 @@ def get_issues_to_summarize(
     """
     # The time format for the query needs to be in the local timezone of the
     # user, so we need to convert
-    user_zi = self(client).tzinfo
+    user_zi = get_self(client).tzinfo
     since_string = since.astimezone(user_zi).strftime("%Y-%m-%d %H:%M")
     updated_issues = client.jql(
         f"labels = '{SUMMARY_ALLOWED_LABEL}' and updated >= '{since_string}' ORDER BY updated DESC",