wowtoken.app-backend/wow-token-historical.py
Emily Doherty 27cd98ee52 Removing the usage of Timestream
Moving to using a DynamoDB global table with compacted writing
2024-10-29 15:13:41 -07:00

345 lines
12 KiB
Python

import sys
from typing import List, Dict
import boto3
from boto3.dynamodb.conditions import Key
from collections import deque
import datetime
import calendar
import json
import os
import statistics
# Maps the AWS region this code is running in (key) to the region whose
# DynamoDB endpoint should be queried (value).
# This is a rough first pass at an intelligent region selector based on what is replicated
dynamo_region_map = {
    # North America
    'us-west-1': 'us-west-1',
    'us-west-2': 'us-west-2',
    'us-east-1': 'us-east-1',
    'us-east-2': 'us-east-2',
    # Asia Pacific
    'ap-south-1': 'eu-north-1',
    'ap-northeast-3': 'ap-northeast-1',
    'ap-northeast-2': 'ap-northeast-1',
    'ap-southeast-1': 'ap-southeast-1',
    'ap-southeast-2': 'ap-southeast-2',
    'ap-northeast-1': 'ap-northeast-1',
    # Canada
    'ca-central-1': 'us-east-1',
    # Europe
    'eu-central-1': 'eu-north-1',
    'eu-west-1': 'eu-west-1',
    'eu-west-2': 'eu-west-1',
    'eu-west-3': 'eu-west-3',
    'eu-north-1': 'eu-north-1',
    # South America
    'sa-east-1': 'sa-east-1',
    # Europe (listed last in the original table)
    'eu-south-1': 'eu-north-1',
}
# Resolve which DynamoDB region to query for this execution environment.
# AWS_REGION is read with .get() so that importing the module where the
# variable is absent (e.g. a local run through main()) falls back to the
# default region instead of raising KeyError; unknown regions fall back too.
local_dynamo_region = dynamo_region_map.get(os.environ.get('AWS_REGION', ''), 'eu-central-1')
# `local_region` used to stay an empty string forever; mirror the resolved
# region so that any code reporting it shows the region actually in use.
local_region = local_dynamo_region
# NOTE(review): the Timestream names below appear unused by the visible code
# now that reads go through DynamoDB; they are kept so module-level names do
# not disappear -- confirm nothing imports them before removing.
local_timestream_region = 'eu-central-1'
timestream_client = boto3.client('timestream-query', region_name='us-east-1')
dynamodb_client = boto3.resource('dynamodb', region_name=local_dynamo_region)
# Table names per game version. The 'recent' table is read by dynamo_data()
# and the 'compacted' table by _get_dynamo_compacted(); both versions share
# the same compacted table name.
tables = {
    'retail': {
        'recent': 'wow-token-price-recent',
        'current': 'wow-token-price',
        'compacted': 'wow-token-compacted',
        'timestream': 'wow-token-price-history',
    },
    'classic': {
        'recent': 'wow-token-classic-price-recent',
        'current': 'wow-token-classic-price',
        'compacted': 'wow-token-compacted',
        'timestream': 'wow-token-classic-price-history',
    },
}
def historical_data(time, region, version):
    """Return the price history for ``region``/``version`` over ``time``.

    Ranges ending in ``h`` are answered by ``dynamo_data``; every other
    range is answered by ``dynamo_compacted``.
    """
    # This shim is to permanently change the URL of 30d to 720h for local caching,
    # There seems to be at least 1 person using 30d (strangely with no .json) which was deprecated
    # as the data source for 1 month of data years ago
    requested = '720h' if time == '30d' else time
    fetch = dynamo_data if requested[-1] == 'h' else dynamo_compacted
    return fetch(requested, region, version)
def _get_dynamo_compacted(time: str, region: str, version: str) -> List[Dict[str, int|str]]:
    """Read one compacted history item and expand it into chart points.

    The item is looked up by the partition key ``'<region>-<version>-<time>'``
    in the version's 'compacted' table. Its ``data`` attribute is a mapping
    of epoch-second keys to prices.

    Returns:
        A list of ``{'time': <ISO-8601 UTC string>, 'value': <int>}`` dicts
        in ascending time order, or an empty list when the query finds no
        item (the original indexed ``Items[0]`` unconditionally and raised
        IndexError in that case).
    """
    table = dynamodb_client.Table(tables[version]['compacted'])
    pk = f'{region}-{version}-{time}'
    response = table.query(
        KeyConditionExpression=Key('region-flavor-timestamp').eq(pk)
    )
    items = response.get('Items') or []
    if not items:
        return []
    # Sort on the numeric timestamp: ordering the raw string keys is only
    # chronological while every key has the same number of digits.
    points = sorted(
        (int(timestamp), int(price))
        for timestamp, price in items[0]['data'].items()
    )
    return [
        {
            'time': datetime.datetime.fromtimestamp(timestamp, tz=datetime.UTC).isoformat(),
            'value': price,
        }
        for timestamp, price in points
    ]
def dynamo_compacted(time: str, region: str, version: str) -> List[Dict[str, int|str]]:
    """Return compacted history points by delegating to ``_get_dynamo_compacted``.

    The return annotation now matches the helper: each point carries a string
    ``'time'`` and an integer ``'value'``.
    """
    return _get_dynamo_compacted(time, region, version)
def dynamo_data(time, region, version):
    """Return recent price changes for ``region`` from the 'recent' table.

    Args:
        time: Range string whose last character is a unit suffix and whose
            prefix is a number of hours (e.g. ``'720h'``).
        region: Partition key value for the query.
        version: Key into ``tables`` ('retail' or 'classic').

    Returns:
        A list of ``{'time': <ISO-8601 UTC string>, 'value': <int>}`` dicts,
        one per price change: consecutive rows whose converted price equals
        the previously emitted one are skipped.
    """
    # Log the region that is actually queried (the old message printed
    # `local_region`, which is never assigned anything but '').
    print(f"Function region: {os.environ.get('AWS_REGION')}\t Dynamo Region: {local_dynamo_region}")
    hours = int(time[:-1])
    # Timezone-aware "now"; utcnow()/utcfromtimestamp() are deprecated.
    start_time = datetime.datetime.now(tz=datetime.UTC) - datetime.timedelta(hours=hours)
    table = dynamodb_client.Table(tables[version]['recent'])
    query_args = {
        'KeyConditionExpression': (
            Key('region').eq(region) &
            Key('timestamp').gte(int(start_time.timestamp()))
        )
    }
    data = []
    last_price = 0
    while True:
        response = table.query(**query_args)
        for item in response['Items']:
            # Integer division avoids a float round trip
            # (presumably copper -> gold; confirm the unit with the writer).
            price = int(item['price']) // 10000
            if price != last_price:
                item_time = datetime.datetime.fromtimestamp(
                    int(item['timestamp']), tz=datetime.UTC).isoformat()
                data.append({
                    'time': item_time,
                    'value': price
                })
                last_price = price
        # A single Query call returns at most one page; keep reading while
        # DynamoDB reports that more rows remain.
        last_key = response.get('LastEvaluatedKey')
        if not last_key:
            break
        query_args['ExclusiveStartKey'] = last_key
    return data
def aggregate_data(aggregate_function: str, data: list):
    """Apply the named aggregate to ``data``.

    Supported names are ``daily_``/``weekly_`` combined with ``max``, ``min``
    or ``mean``. Any other name yields ``None``.
    """
    # name -> (direction, bucket size in days); direction 0 selects the mean,
    # 1 the maximum and -1 the minimum.
    plans = {
        'daily_max': (1, 1),
        'daily_min': (-1, 1),
        'daily_mean': (0, 1),
        'weekly_max': (1, 7),
        'weekly_min': (-1, 7),
        'weekly_mean': (0, 7),
    }
    plan = plans.get(aggregate_function)
    if plan is None:
        return None
    direction, bucket_size = plan
    if direction == 0:
        return mean(bucket_size, data)
    return max_min(direction, bucket_size, data)
def date_in_range(day_range: tuple, date: datetime.datetime) -> bool:
    """Check ``date``'s day of month against ``day_range``.

    Args:
        day_range: ``(start_day, end_day)`` pair of day-of-month numbers.
        date: The date whose ``day`` attribute is tested.

    Returns:
        True when ``start_day <= date.day < end_day``, and also when
        ``date.day`` is below both bounds; False otherwise.

    NOTE(review): the second clause means every day below ``end_day`` is
    accepted. It looks like an attempt at ranges that wrap over a month
    boundary, but days at or after ``start_day`` in such a range are still
    rejected -- behaviour is preserved here, confirm the intent.
    """
    # The unused calendar.monthrange() lookup from the original was dropped.
    start_day, end_day = day_range[0], day_range[1]
    if start_day <= date.day < end_day:
        return True
    # TODO: I am probably missing a sanity check here, come back to it
    return date.day < end_day and date.day < start_day
def day_bucket(bucket_size: int, date: datetime.datetime) -> tuple[datetime.datetime, datetime.datetime]:
    """Return the ``(start, end)`` boundaries of the bucket beginning at ``date``.

    Args:
        bucket_size: Bucket length in days. A size of 7 is treated as a
            weekly bucket aligned to the reset day.
        date: Start of the bucket.

    Returns:
        ``(date, date + N days)``. N is ``bucket_size``, except for a weekly
        bucket that does not start on a Tuesday, where N is the number of
        days until the next Tuesday.
    """
    # We want the bucket boundaries for a bucket size of 7 to fall on
    # reset day (index 1), and for a month (31) to fall on the actual boundaries of that month
    # this means month-to-month, there are dynamic sizing of buckets
    # TODO: Monthly boundaries
    reset_weekday = 1  # This is WoW, the week starts on Tuesday (datetime index 1)
    if bucket_size == 7 and date.weekday() != reset_weekday:
        # Days remaining until the next reset day (Mon -> 1, Wed -> 6, ..., Sun -> 2).
        bucket_size = (reset_weekday - date.weekday()) % 7
    return date, date + datetime.timedelta(days=bucket_size)
def is_new_bucket(d_datetime: datetime.datetime, current_bucket_day: int, bucket: tuple) -> bool:
    """Tell whether ``d_datetime`` starts a new bucket.

    Args:
        d_datetime: Timestamp of the datum being examined.
        current_bucket_day: Day of month on which the current bucket was
            opened (annotated as ``int``; the previous annotation,
            ``datetime.datetime.day``, is a descriptor object rather than a type).
        bucket: ``(start, end)`` pair as returned by ``day_bucket``.

    Returns:
        True when the day of month differs from ``current_bucket_day`` and
        the datum is either at/after the bucket end or falls on a Tuesday
        (weekday index 1).
    """
    if d_datetime.day == current_bucket_day:
        # Same calendar day as the bucket opening: never roll over.
        return False
    return d_datetime >= bucket[1] or d_datetime.weekday() == 1
def __sum_total(__data: list) -> int:
    """Return the sum of the ``'value'`` entry of every dict in ``__data``."""
    return sum(entry['value'] for entry in __data)
def max_min(fn: int, bucket_size: int, data: list) -> list:
    """Reduce ``data`` to one extreme value per bucket.

    Args:
        fn: ``1`` selects the maximum of each bucket, ``-1`` the minimum.
            Any other value produces no output points.
        bucket_size: Bucket length in days, forwarded to ``day_bucket``.
        data: Chronologically ordered dicts with an ISO-8601 ``'time'``
            string and a numeric ``'value'``.

    Returns:
        A list of ``{'time': <ISO string of the extreme>, 'value': <extreme>}``
        dicts, one per *completed* bucket. A point is only emitted when the
        first datum of the following bucket is seen, so the trailing bucket
        is not emitted (behaviour preserved from the original;
        NOTE(review): confirm dropping the last bucket is intended).
        An empty ``data`` list yields an empty result instead of IndexError.
    """
    if not data:
        return []
    new_data = []
    first_date = datetime.datetime.fromisoformat(data[0]['time'])
    current_bucket_day = first_date.day
    # I hate working with dates
    bucket = day_bucket(bucket_size, first_date)
    # The running extreme starts as None rather than the old 999_999_999 / 0
    # sentinels, so values outside that range are tracked correctly and a
    # placeholder date can never reach the output.
    extreme_value = None
    extreme_time = None
    for d in data:
        d_datetime = datetime.datetime.fromisoformat(d['time'])
        if is_new_bucket(d_datetime, current_bucket_day, bucket):
            current_bucket_day = d_datetime.day
            bucket = day_bucket(bucket_size, d_datetime)
            if fn in (-1, 1):
                new_data.append({'time': extreme_time, 'value': extreme_value})
            extreme_value = None
        value = d['value']
        # Strict comparisons keep the first occurrence when values tie.
        if extreme_value is None:
            is_better = True
        elif fn == -1:
            is_better = value < extreme_value
        else:
            is_better = value > extreme_value
        if is_better:
            extreme_value = value
            extreme_time = d_datetime.isoformat()
    return new_data
def mean(bucket_size: int, data: list) -> list:
    """Reduce ``data`` to one mean value per bucket.

    Args:
        bucket_size: Bucket length in days, forwarded to ``day_bucket``.
        data: Chronologically ordered dicts with an ISO-8601 ``'time'``
            string and a numeric ``'value'``.

    Returns:
        A list of ``{'time': <ISO string>, 'value': <int mean>}`` dicts, one
        per *completed* bucket; the trailing bucket is not emitted. An empty
        ``data`` list yields an empty result instead of IndexError.
    """
    if not data:
        return []
    new_data = []
    first_date = datetime.datetime.fromisoformat(data[0]['time'])
    current_bucket_day = first_date.day
    bucket = day_bucket(bucket_size, first_date)
    mean_bucket = []
    for d in data:
        d_datetime = datetime.datetime.fromisoformat(d['time'])
        if is_new_bucket(d_datetime, current_bucket_day, bucket):
            current_bucket_day = d_datetime.day
            bucket = day_bucket(bucket_size, d_datetime)
            # NOTE(review): `bucket` has already been replaced at this point,
            # so the mean of the bucket that just closed is stamped with the
            # start of the *next* bucket. Kept as-is because consumers may
            # rely on it; the unused `bucket_date` local in the original
            # suggests the closing bucket's start was intended -- confirm.
            new_data.append({'time': bucket[0].isoformat(), 'value': int(statistics.mean(mean_bucket))})
            mean_bucket = []
        mean_bucket.append(d['value'])
    return new_data
# TODO FIXME
def simple_moving_average(hours: int, data: list) -> list:
    """Unfinished moving-average aggregate (flagged TODO FIXME by the author).

    Args:
        hours: Number of hour groups the queue should hold before a point
            is emitted.
        data: List of dicts with an ISO-8601 ``'time'`` string and a numeric
            ``'value'``.

    Returns:
        A list of ``{'value': <int average of the queued values>,
        'time': <ISO string of head_date>}`` dicts.

    NOTE(review): nothing in the visible code calls this function, and the
    nesting below was reconstructed from source without indentation -- in
    particular the trailing ``elif hours_in_queue < 5`` is assumed to pair
    with ``if hours_in_queue == hours``. Confirm before relying on it.
    """
    # The cyclomatic complexity of this function is getting high, I need to figure out a more elegant solution
    new_data = []
    queue = deque()
    hours_in_queue = 0
    # NOTE(review): index 8 is hard-coded; this raises IndexError when data
    # has fewer than nine entries -- confirm why the ninth entry is the head.
    head_date = datetime.datetime.fromisoformat(data[8]['time'])
    for datum in data:
        datum_datetime = datetime.datetime.fromisoformat(datum['time'])
        # Only the hour-of-day is compared, not the full date.
        if datum_datetime.hour == head_date.hour:
            queue.append(datum)
        elif datum_datetime.hour != head_date.hour:
            if hours_in_queue == hours:
                # Window is full: emit the average of everything queued.
                q_list = list(queue)
                total = __sum_total(q_list)
                new_datum = {
                    'value': int(total / len(q_list)),
                    'time': head_date.isoformat()
                }
                new_data.append(new_datum)
                # Count the queued entries that share the head's hour.
                deque_val = 0
                for d in q_list:
                    __dt = datetime.datetime.fromisoformat(d['time'])
                    # NOTE(review): `__dt.day == __dt.day` is always true;
                    # presumably meant to compare against head_date.day.
                    if __dt.hour == head_date.hour and __dt.day == __dt.day:
                        deque_val += 1
                # NOTE(review): deque.pop() removes from the right (newest)
                # end; popleft() may have been intended to drop the oldest hour.
                while deque_val != 0:
                    queue.pop()
                    deque_val -= 1
                hours_in_queue -= 1
                head_date = datum_datetime
            # NOTE(review): the limit 5 is hard-coded instead of using `hours`.
            elif hours_in_queue < 5:
                queue.append(datum)
                hours_in_queue += 1
    return new_data
def moving_weighted_average(days: int, data: list) -> list:
    """Placeholder for a weighted moving average; not implemented, returns ``None``."""
    return None
def validate_path(split_uri: list) -> bool:
    """Validate the last two segments of a request path split on ``'/'``.

    Args:
        split_uri: Path segments; the last is expected to be
            ``'<time>.json'`` and the one before it a region code.

    Returns:
        True only when the file name ends in ``.json``, the region passes
        ``validate_region`` and the text before the first dot passes
        ``validate_time``. Lists with fewer than two segments are rejected
        instead of raising IndexError.
    """
    if len(split_uri) < 2:
        return False
    file_name, region = split_uri[-1], split_uri[-2]
    # Require the real '.json' extension; a bare 'json' suffix check also
    # accepted names such as '720h.notjson'.
    if not file_name.endswith('.json'):
        return False
    return validate_region(region) and validate_time(file_name.split('.')[0])
def validate_time(time: str) -> bool:
    """Return True when ``time`` is an accepted range specifier.

    Accepted forms are ``'all'`` or a decimal amount followed by a unit:
    ``h`` (24-999), ``d`` (30-100), ``m`` (1-12) or ``y`` (1-10).

    Malformed input (empty string, missing or non-numeric amount, unknown
    unit) returns False; previously such input raised IndexError or
    ValueError, which ``lambda_handler`` does not catch.
    """
    if time == 'all':
        return True
    # unit -> inclusive (lowest, highest) amount
    limits = {'h': (24, 999), 'd': (30, 100), 'm': (1, 12), 'y': (1, 10)}
    if not time or time[-1] not in limits:
        return False
    amount_text = time[:-1]
    # Plain ASCII digits only: rejects '', signs, spaces and underscores.
    if not (amount_text.isascii() and amount_text.isdigit()):
        return False
    try:
        amount = int(amount_text)
    except ValueError:
        # int() refuses extremely long digit strings on recent Pythons.
        return False
    lowest, highest = limits[time[-1]]
    return lowest <= amount <= highest
def validate_region(region: str) -> bool:
    """Return True when ``region`` is one of the supported region codes."""
    return region in ('us', 'eu', 'tw', 'kr')
def validate_aggregate(aggregate_function: str) -> bool:
    """Return True when ``aggregate_function`` names a supported aggregate."""
    return aggregate_function in (
        'daily_max', 'daily_min', 'daily_mean',
        'weekly_max', 'weekly_min', 'weekly_mean',
    )
def lambda_handler(event, context):
    """Serve token price history as JSON for the requested URI.

    The URI is read from ``event['Records'][0]['cf']['request']['uri']``
    (presumably a CloudFront Lambda@Edge request event -- confirm). Its last
    three segments are interpreted as ``<aggregate>/<region>/<time>.json``;
    a ``classic`` segment anywhere in the path selects the classic tables.

    Returns:
        A response dict with status ``'200'`` and a JSON body, or a
        ``'404'`` response when the path does not validate.
    """
    request = event['Records'][0]['cf']['request']
    split_uri = request['uri'].split('/')
    if not validate_path(split_uri):
        return {'status': '404', 'statusDescription': 'NotFound', 'headers': {}}

    version = 'classic' if 'classic' in split_uri else 'retail'
    file_name, region, aggregate_function = split_uri[-1], split_uri[-2], split_uri[-3]
    time = file_name.split('.')[0]

    data = historical_data(time, region, version)
    # The aggregate segment is optional: unknown values leave the data raw.
    if validate_aggregate(aggregate_function):
        data = aggregate_data(aggregate_function, data)

    return {
        'status': '200',
        'statusDescription': 'OK',
        'headers': {
            'content-type': [{'key': 'Content-Type', 'value': 'application/json'}],
        },
        'body': json.dumps(data),
    }
def main():
    """Local entry point for manual experiments; currently does nothing."""
    # Example of a manual check:
    #data = dynamo_compacted('1y', 'us', 'retail')
    #print(data)
    return None


if __name__ == '__main__':
    main()