Skip to content

Commit

Permalink
Switch to using the official Prometheus Python client
Browse files Browse the repository at this point in the history
Closes #5
  • Loading branch information
emphoeller committed Apr 16, 2024
1 parent 7322116 commit fc3555d
Show file tree
Hide file tree
Showing 4 changed files with 66 additions and 134 deletions.
9 changes: 7 additions & 2 deletions Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -4,9 +4,14 @@ USER root:root

# https://techoverflow.net/2021/01/13/how-to-use-apt-install-correctly-in-your-dockerfile/
ENV DEBIAN_FRONTEND=noninteractive
RUN apt update && apt install -y python3 && rm -rf /var/lib/apt/lists/*
RUN apt update && apt install -y python3 python3-venv && rm -rf /var/lib/apt/lists/*

COPY --link --chmod=755 coturn_exporter /coturn_exporter_files/
RUN python3 -m venv /coturn_exporter_files/venv/
RUN /coturn_exporter_files/venv/bin/pip install --no-cache-dir --upgrade pip
COPY --chmod=644 requirements.txt /coturn_exporter_files/
RUN /coturn_exporter_files/venv/bin/pip install --no-cache-dir -r /coturn_exporter_files/requirements.txt

COPY --chmod=755 coturn_exporter /coturn_exporter_files/
RUN ln -st /usr/local/bin/ /coturn_exporter_files/coturn_exporter

USER nobody:nogroup
Expand Down
11 changes: 6 additions & 5 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -13,11 +13,12 @@ All environment variables except `IP` are optional. `INTERVAL` (how many seconds

## Output

Metrics are exported on port 9524 under `/metrics`. A response looks like this:
Metrics are exported on port 9524. They will include something like the following (plus some metrics added by the [Prometheus Python client](https://github.com/prometheus/client_python)):

```
# TYPE is_turnserver_ok gauge
# HELP is_turnserver_ok Whether the TURN server is OK
is_turnserver_ok 1 2279953800.123456
# EOF
# HELP turn_server_state the state of the TURN server
# TYPE turn_server_state gauge
turn_server_state{turn_server_state="ok"} 1.0
turn_server_state{turn_server_state="unreachable"} 0.0
turn_server_state{turn_server_state="unknown"} 0.0
```
179 changes: 52 additions & 127 deletions coturn_exporter
Original file line number Diff line number Diff line change
@@ -1,22 +1,22 @@
#!/usr/bin/python3
#!/coturn_exporter_files/venv/bin/python

import sys
import os
import tempfile
import http.server
import threading
import subprocess
import shutil
import signal
import time
import prometheus_client

EXECUTABLE_NAME = 'turnutils_uclient'
SERVER_PORT = 9524
# Seconds until the executable receives SIGTERM
TIMEOUT_SIGTERM = 100
# Seconds after that until the executable receives SIGKILL
TIMEOUT_SIGKILL = 5
# Number of consecutive failures to determine the server status before HTTP
# requests to /metrics will be responded to with 500 Internal Server Error
# Number of consecutive failures to determine the server status before
# turn_server_state will be set to "unknown"
MAX_FAILURES = 5

# Stupid-yet-effective way to prevent errors of the sort
Expand Down Expand Up @@ -73,10 +73,6 @@ executable_path = shutil.which(EXECUTABLE_NAME)
if not executable_path:
error(f'could not locate {EXECUTABLE_NAME}')

tempdir = tempfile.mkdtemp()
os.chdir(tempdir)
os.mkdir('server')

shutdown_event = threading.Event()

def shutdown_signal_handler(signum, frame):
Expand All @@ -90,88 +86,11 @@ signal.signal(signal.SIGPIPE, shutdown_signal_handler)
signal.signal(signal.SIGTERM, shutdown_signal_handler)
signal.signal(signal.SIGQUIT, shutdown_signal_handler)

class MetricsRequestHandler(http.server.BaseHTTPRequestHandler):
protocol_version = 'HTTP/1.1'
def _head_get_impl(self, include_body):
if self.path.split('?')[0].lower() not in ('/metrics', '/metrics/'):
self.send_error(404)
return
try:
with open('server/metrics', 'rb') as f:
mtime = os.fstat(f.fileno()).st_mtime
data = f.read()
except Exception:
self.send_error(500)
return
self.send_response(200)
self.send_header('Content-Length', str(len(data)))
self.send_header(
'Content-Type',
'application/openmetrics-text; version=1.0.0; charset=utf-8')
self.send_header(
'Last-Modified',
time.strftime('%a, %d %b %Y %H:%M:%S GMT', time.gmtime(mtime)))
self.end_headers()
if include_body:
self.wfile.write(data)
def do_HEAD(self):
self._head_get_impl(include_body=False)
def do_GET(self):
self._head_get_impl(include_body=True)

server = http.server.ThreadingHTTPServer(
('0.0.0.0', 9524), MetricsRequestHandler)
server_thread = threading.Thread(
target=server.serve_forever, name='server_thread', daemon=True)

class ShutdownException(Exception):
pass

def wait_and_check_shutdown(process, timeout):
wait_until = time.monotonic_ns() + timeout * 1e9
while True:
ex = None
try:
process.wait(0.1)
except subprocess.TimeoutExpired as e:
ex = e
if shutdown_event.is_set():
raise ShutdownException
if not ex:
return
if wait_until <= time.monotonic_ns():
raise ex

failures = 0

def check_failed():
if failures < MAX_FAILURES:
failures += 1
return
warn('too many consecutive failures, the metrics server now responds with '
'500 Internal Server Error')
try:
os.remove('server/metrics')
except FileNotFoundError:
pass
except Exception as e:
warn_exception('could not remove "server/metrics"', e)

def check_succeeded(server_ok):
failures = 0
try:
with open('metrics_prep', 'wb') as f:
f.write(
b'# TYPE is_turnserver_ok gauge\n'
b'# HELP is_turnserver_ok Whether the TURN server is OK\n'
b'is_turnserver_ok %b %f\n'
b'# EOF\n'
% (b'1' if server_ok else b'0', time.time())
)
os.rename('metrics_prep', 'server/metrics') # Atomic replacement
except Exception as e:
warn_exception(
'failed to set up "metrics_prep" or to overwrite "server/metrics"', e)
prometheus_client.disable_created_metrics()
turn_server_state_enum = prometheus_client.Enum('turn_server_state',
'the state of the TURN server', states=['ok', 'unreachable', 'unknown'])
turn_server_state_enum.state('unknown')
server, server_thread = prometheus_client.start_http_server(SERVER_PORT)

args = [
executable_path,
Expand All @@ -184,47 +103,53 @@ args = [
IP
]

def checker_thread_func():
while True:
# negative: failed to determine status, 0: is ok, positive: is not ok
# (NOTE: this is different from the exported boolean gauge, where 1 is ok
# and 0 is not ok)
check_result = None
process = subprocess.Popen(args)
failures = 0 # Consecutive failures to determine the server status
def failed():
global failures
if failures < MAX_FAILURES:
failures += 1
return
warn('too many consecutive failures determining the server status, setting '
'to unknown')
turn_server_state_enum.state('unknown')
while True:
process = subprocess.Popen(args)
wait_until = time.monotonic_ns() + TIMEOUT_SIGTERM * 1e9
while time.monotonic_ns() < wait_until and not shutdown_event.is_set():
try:
wait_and_check_shutdown(process, TIMEOUT_SIGTERM)
except (subprocess.TimeoutExpired, ShutdownException) as e:
if isinstance(e, subprocess.TimeoutExpired):
check_result = -1
warn(
f'{EXECUTABLE_NAME} could not determine the TURN server status '
f'within the timeout ({TIMEOUT_SECONDS} seconds), killing and '
'retrying')
process.terminate()
try:
process.wait(TIMEOUT_SIGKILL)
except subprocess.TimeoutExpired:
process.kill()
process.wait()
process.wait(0.1)
except subprocess.TimeoutExpired:
pass
else:
check_result = process.returncode
if process.returncode < 0:
warn(
f'{EXECUTABLE_NAME} was terminated unexpectedly by signal '
f'{-process.returncode}, retrying')
if shutdown_event.is_set():
break
check_succeeded(check_result == 0) if check_result >= 0 else check_failed()
if shutdown_event.wait(INTERVAL if check_result >= 0 else 1):
break
if shutdown_event.is_set():
break
time_until_next_check = 1
if process.returncode is None: # Still running
warn(
f'{EXECUTABLE_NAME} could not determine the TURN server status within '
f'the timeout ({TIMEOUT_SECONDS} seconds), killing and retrying')
failed()
process.terminate()
try:
process.wait(TIMEOUT_SIGKILL)
except subprocess.TimeoutExpired:
process.kill()
process.wait()
else:
if process.returncode < 0:
warn(
f'{EXECUTABLE_NAME} was terminated unexpectedly by signal '
f'{-process.returncode}, retrying')
failed()
else:
failures = 0
state, time_until_next_check = ('ok', INTERVAL) if process.returncode == 0 else ('unreachable', min(INTERVAL, 10))
turn_server_state_enum.state(state)
if shutdown_event.wait(time_until_next_check):
break

checker_thread = threading.Thread(
target=checker_thread_func, name='checker_thread', daemon=True)
checker_thread.start()
server_thread.start()
checker_thread.join() # Terminated only by signal
server.shutdown()
server_thread.join()
shutil.rmtree(tempdir)

sys.exit(0)
1 change: 1 addition & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
prometheus-client

0 comments on commit fc3555d

Please sign in to comment.