Skip to content

Commit

Permalink
Feature/upgrade to ga4 (#2)
Browse files Browse the repository at this point in the history
* Upgrades to GA4

* Create GA4 methods to load data

* Update the resource and packages data

* Pinned google-analytics-data to allow only minor and patch updates

---------

Co-authored-by: Konstantin Sivakov <[email protected]>
  • Loading branch information
MarkCalvert and tino097 authored Jun 13, 2023
1 parent ef084a2 commit 4a8e06b
Show file tree
Hide file tree
Showing 5 changed files with 146 additions and 26 deletions.
165 changes: 141 additions & 24 deletions ckanext/googleanalytics/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,12 +7,14 @@
import logging
import click
import ckan.model as model
from ckan.plugins import toolkit as tk

from . import dbutil, config

# Module-level logger named after this module.
log = logging.getLogger(__name__)
# Path prefix used to filter Analytics page paths and to strip the
# leading part of a dataset URL when extracting the dataset name.
PACKAGE_URL = "/dataset/" # XXX get from routes...

# Matches a whole "/dataset/<name>" path; group 1 is the segment after
# "/dataset/".
DATASET_URL_REGEX = re.compile("^/dataset/([a-z0-9-_]+)$")
# Matches "/dataset/<name>/resource/<id>"; group 1 is the segment after
# "/resource/".
RESOURCE_URL_REGEX = re.compile("/dataset/[a-z0-9-_]+/resource/([a-z0-9-_]+)")
# Matches "/dataset/edit/<name>"; group 1 is the segment after
# "/dataset/edit/".
DATASET_EDIT_REGEX = re.compile("/dataset/edit/([a-z0-9-_]+)")

Expand All @@ -38,32 +40,49 @@ def init():
@googleanalytics.command(short_help=u"Load data from Google Analytics API")
@click.argument("credentials", type=click.Path(exists=True))
@click.option("-s", "--start-date", required=False)
@click.option("-ga4", "--ga4", required=False, is_flag=True)
def load(credentials, start_date, ga4):
    """Parse data from Google Analytics API and store it
    in a local database.

    CREDENTIALS is the path of the credentials file handed to the Google
    client. Without --ga4 the profile-based API is queried, either as a
    bulk import starting at --start-date (YYYY-MM-DD) or, when no start
    date is given, as a recent/ever summary. With --ga4 the property named
    by the `googleanalytics.property_id` config option is queried instead
    and --start-date is not used.
    """
    if not ga4:
        # Imported lazily so the Google client libraries are only needed
        # when the command actually runs.
        from .ga_auth import init_service, get_profile_id

        try:
            service = init_service(credentials)
        except TypeError as e:
            # Keep the original exception type/message; chain the cause so
            # the underlying traceback is not lost.
            raise Exception(
                "Unable to create a service: {0}".format(e)
            ) from e

        profile_id = get_profile_id(service)
        if not profile_id:
            tk.error_shout("Unknown Profile ID. `googleanalytics.profile_id` or `googleanalytics.account` must be specified")
            raise click.Abort()

        if start_date:
            bulk_import(service, profile_id, start_date)
        else:
            # Comma-separated filter: dataset pages or pages under the
            # configured prefix.
            query = "ga:pagePath=~%s,ga:pagePath=~%s" % (
                PACKAGE_URL,
                config.prefix(),
            )
            packages_data = get_ga_data(service, profile_id, query_filter=query)
            save_ga_data(packages_data)
            log.info("Saved %s records from google", len(packages_data))
    else:
        from .ga_auth import get_ga4_client

        try:
            client = get_ga4_client(credentials)
        except TypeError as e:
            raise Exception(
                "Unable to create a client: {0}".format(e)
            ) from e

        property_id = tk.config.get("googleanalytics.property_id")
        if not property_id:
            tk.error_shout("Unknown Property ID. `googleanalytics.property_id` must be specified")
            raise click.Abort()

        packages_data = get_ga4_data(client, property_id)
        save_ga4_data(packages_data)
        log.info("Saved %s records from google", len(packages_data))


###############################################################################
# xxx #
###############################################################################
Expand Down Expand Up @@ -150,7 +169,7 @@ def internal_save(packages_data, summary_date):

def bulk_import(service, profile_id, start_date=None):
if start_date:
# Get summeries from specified date
# Get summaries from specified date
start_date = datetime.datetime.strptime(start_date, "%Y-%m-%d")
else:
# No date given. See when we last have data for and get data
Expand Down Expand Up @@ -183,7 +202,7 @@ def bulk_import(service, profile_id, start_date=None):


def get_ga_data_new(service, profile_id, start_date=None, end_date=None):
"""Get raw data from Google Analtyics for packages and
"""Get raw data from Google Analytics for packages and
resources.
Returns a dictionary like::
Expand All @@ -203,7 +222,7 @@ def get_ga_data_new(service, profile_id, start_date=None, end_date=None):

start_index = 1
max_results = 10000
# data retrival is chunked
# data retrieval is chunked
completed = False
while not completed:
results = (
Expand Down Expand Up @@ -239,6 +258,40 @@ def get_ga_data_new(service, profile_id, start_date=None, end_date=None):
return packages


def save_ga4_data(packages):
    """Store GA4 visit counts for resources and datasets.

    :param packages: mapping of page path to a dict of counts. The
        ``"recent"`` and ``"ever"`` keys are read; a missing key counts
        as 0.

    A path matching ``RESOURCE_URL_REGEX`` updates the resource whose id
    is the captured segment; otherwise a path matching
    ``DATASET_URL_REGEX`` updates the dataset whose name is the captured
    segment. Paths matching neither, or naming an unknown resource or
    dataset, are logged and skipped. The session is committed once after
    all paths have been processed.
    """
    for identifier, visits in packages.items():
        recently = visits.get("recent", 0)
        ever = visits.get("ever", 0)

        # Resource paths are checked first, mirroring the original
        # precedence between the two patterns.
        resource_matches = RESOURCE_URL_REGEX.match(identifier)
        if resource_matches:
            resource_url = resource_matches[0]
            # Single lookup by the captured id; the former extra
            # Resource.get() call with the URL path as id was discarded
            # immediately and has been dropped.
            resource = (
                model.Session.query(model.Resource)
                .autoflush(True)
                .filter_by(id=resource_matches.group(1))
                .first()
            )
            if not resource:
                log.warning("Couldn't find resource %s", resource_url)
                continue
            dbutil.update_resource_visits(resource.id, recently, ever)
            log.info("Updated %s with %s visits", resource.id, visits)
            continue

        dataset_matches = DATASET_URL_REGEX.match(identifier)
        if dataset_matches:
            # The pattern is anchored at both ends, so the captured group
            # is exactly the path with the PACKAGE_URL prefix removed.
            package_name = dataset_matches.group(1)
            item = model.Package.by_name(package_name)
            if not item:
                log.warning("Couldn't find package %s", package_name)
                continue
            dbutil.update_package_visits(item.id, recently, ever)
            log.info("Updated %s with %s visits", item.id, visits)
            continue

        log.warning(f"No matches for identifier {identifier}")

    model.Session.commit()


def save_ga_data(packages_data):
"""Save tuples of packages_data to the database"""
for identifier, visits in list(packages_data.items()):
Expand Down Expand Up @@ -272,6 +325,42 @@ def save_ga_data(packages_data):
model.Session.commit()


def ga4_query(client, property_id, from_date=None):
    """Run one report against Google Analytics 4 and return the response.

    :param client: object exposing ``run_report(request)``.
    :param property_id: GA4 property id; sent as ``properties/<id>``.
    :param from_date: start of the date range, either a ``datetime.date``
        (formatted as YYYY-MM-DD) or a value passed through unchanged.
    :returns: whatever ``client.run_report`` returns.

    The report counts ``engagedSessions`` per ``pagePathPlusQueryString``
    for paths beginning with ``PACKAGE_URL``, up to today's date.
    """
    from google.analytics.data_v1beta.types import (
        DateRange,
        Dimension,
        Metric,
        RunReportRequest,
        FilterExpression,
        Filter
    )

    end_day = datetime.datetime.now().strftime("%Y-%m-%d")

    # NOTE(review): a from_date of None is forwarded as-is to DateRange;
    # confirm callers always supply a start date.
    start_day = from_date
    if isinstance(start_day, datetime.date):
        start_day = start_day.strftime("%Y-%m-%d")

    # Only pages whose path starts with the dataset prefix are reported.
    path_filter = Filter(
        field_name="pagePathPlusQueryString",
        string_filter=Filter.StringFilter(
            value=PACKAGE_URL,
            match_type=Filter.StringFilter.MatchType.BEGINS_WITH,
        ),
    )

    # TODO: Make the metrics a CKAN config option which defaults to engagedSessions
    report_metrics = [Metric(name="engagedSessions")]
    # TODO: Make the dimensions a CKAN config option which defaults to pagePathPlusQueryString
    report_dimensions = [Dimension(name="pagePathPlusQueryString")]

    report_request = RunReportRequest(
        property="properties/{0}".format(property_id),
        dimensions=report_dimensions,
        dimension_filter=FilterExpression(filter=path_filter),
        metrics=report_metrics,
        date_ranges=[DateRange(start_date=start_day, end_date=end_day)],
    )
    return client.run_report(report_request)


def ga_query(
service,
profile_id,
Expand Down Expand Up @@ -309,8 +398,36 @@ def ga_query(
return results


def get_ga4_data(client, property_id):
    """Collect per-path visit counts from Google Analytics 4.

    Two reports are requested through ``ga4_query``: one starting
    ``config.recent_view_days()`` days ago ("recent") and one starting at
    a fixed floor date ("ever"). Counts of rows sharing a path are summed.

    :returns: dict mapping page path to ``{"recent": int, "ever": int}``
        (a key is present only if the matching report had rows for the
        path).
    """
    today = datetime.datetime.now()
    recent_start = (
        today - datetime.timedelta(config.recent_view_days())
    ).strftime("%Y-%m-%d")
    # TODO: Make this configurable. For some reason no data is returned for DataNT when the date is earlier than 2021-02-01
    ever_start = datetime.date(2021, 2, 1)

    totals = {}
    for label, start in (("recent", recent_start), ("ever", ever_start)):
        report = ga4_query(client, property_id, from_date=start)
        for row in report.rows:
            path = row.dimension_values[0].value
            # Convert before touching the result dict so a malformed
            # metric value leaves no partial entry behind.
            hits = int(row.metric_values[0].value)
            per_path = totals.setdefault(path, {})
            per_path[label] = per_path.get(label, 0) + hits
    return totals


def get_ga_data(service, profile_id, query_filter):
"""Get raw data from Google Analtyics for packages and
"""Get raw data from Google Analytics for packages and
resources, and for both the last two weeks and ever.
Returns a dictionary like::
Expand Down
2 changes: 1 addition & 1 deletion ckanext/googleanalytics/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@


def tracking_id():
    """Return the ``googleanalytics.id`` config value, or ``None`` if unset."""
    # .get() is used so a missing option yields None rather than KeyError.
    return tk.config.get("googleanalytics.id")


def download_handler():
Expand Down
3 changes: 3 additions & 0 deletions ckanext/googleanalytics/ga_auth.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
from oauth2client.service_account import ServiceAccountCredentials

from ckanext.googleanalytics import utils, config
from google.analytics.data_v1beta import BetaAnalyticsDataClient


def init_service(credentials_file):
Expand All @@ -13,6 +14,8 @@ def init_service(credentials_file):

return build("analytics", "v3", credentials=credentials)

def get_ga4_client(credentials_json_path):
    """Create a ``BetaAnalyticsDataClient`` from a service-account JSON file.

    :param credentials_json_path: path passed to
        ``BetaAnalyticsDataClient.from_service_account_json``.
    :returns: the client built by that constructor.
    """
    ga4_client = BetaAnalyticsDataClient.from_service_account_json(
        credentials_json_path
    )
    return ga4_client

def get_profile_id(service):
"""Get static profile ID or fetch one from the service.
Expand Down
1 change: 0 additions & 1 deletion ckanext/googleanalytics/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,6 @@ def _mp_api_handler(data_dict):
}]
}, cls=SafeJSONEncoder)
)
# breakpoint()
if resp.status_code >= 300:
log.error("Cannot post event: %s", resp)

Expand Down
1 change: 1 addition & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -2,3 +2,4 @@ gdata>=2.0.0
google-api-python-client>=1.6.1, <1.7.0
pyOpenSSL>=16.2.0
rsa>=3.1.4, <=4.0
google-analytics-data>=0.16.0, <1.0.0

0 comments on commit 4a8e06b

Please sign in to comment.