Why Gemfury? Push, build, and install  RubyGems npm packages Python packages Maven artifacts PHP packages Go Modules Debian packages RPM packages NuGet packages

Repository URL to install this package:

Details    
namara-python / cache.py
Size: Mime:
'''
Namara datasets that have been exported are cached locally to disk under CACHE_HOME directory. Etags are used as the file names; this doubles as a useful file location name and file uniqueness.
The caching strategy uses a cache-index file (CACHE_INDEX_PATH) to store {query: etag:download_url} pairs (base 64 encoded).

If there's no entry for a query, it will create one, and will create new dataset file under CACHE_HOME.

If there is an entry, it will make a HEAD request to the download url and compare the etag from the response vs what's stored in the cache-index. If they don't match, it will remove the current entry, add a new entry, and then write the new dataset file to the cache (in that order).

The cache-index (CACHE_INDEX dictionary) is loaded into memory when the Client module is imported. Each update to the in memory dict is written to disk as well to persist the cache across sessions.

'''

import json, os, requests
from base64 import b64encode, b64decode
from pandas import read_csv

'''
main directory for the cache, the cache-index and the exported files are located here.
clear out this folder if you want to clear the cache
'''
CACHE_HOME = os.path.join(os.getcwd(), '.cache')

# path to the cache-index file
CACHE_INDEX_PATH = os.path.join(CACHE_HOME, '.cache_index')

'''
this is a dictionary which stores query:etag+url pairs. its backed by a file, CACHE_INDEX_PATH, and should be loaded into memory on app start. namara_python.client does this, so you don't have to worry about doing this.
since cached files are named based on etag, this is what tells you which file to load based on the export query
'''
CACHE_INDEX = {}

def load_cache_index():
    ''' Load the persisted cache-index into the in-memory CACHE_INDEX dict.

    Sets up the CACHE_HOME dir and the CACHE_INDEX file at CACHE_INDEX_PATH
    in a safe way: even if they are not there yet, they are created as
    necessary, so this is safe to call on a fresh environment.
    '''
    global CACHE_INDEX

    _safe_create_dir(CACHE_HOME)

    if os.path.exists(CACHE_INDEX_PATH):
        with open(CACHE_INDEX_PATH, 'r', encoding='utf-8') as f:
            try:
                CACHE_INDEX = json.load(f)
            except json.JSONDecodeError:
                # a corrupt or truncated index file would otherwise crash at
                # import time; treat it as an empty cache instead
                CACHE_INDEX = {}
    else:
        # seed an empty index file so later reads always succeed
        with open(CACHE_INDEX_PATH, 'w', encoding='utf-8') as f:
            json.dump({}, f)


def dataset_from_cache(query):
    ''' Takes in a SQL query and returns a pandas DataFrame if an export
    matching that query is cached and still fresh. Returns None otherwise.

    Freshness is checked by issuing a HEAD request to the stored download
    url and comparing its ETag to the one recorded in the cache-index; a
    mismatch evicts the entry (file + index record).
    '''
    query_key = str_encode(query)

    entry = CACHE_INDEX.get(query_key)

    if not entry:
        return None

    # entry is base64("etag:url"); the url itself contains ':' so only
    # split on the first separator
    etag, url = str_decode(entry).split(':', maxsplit=1)

    # cached file name is the hex encoded etag
    file_path = os.path.join(CACHE_HOME, get_file_name_for_etag(etag))

    # BUG FIX: the original HEAD request had no timeout and no error
    # handling, so an unreachable or hung server stalled/crashed the
    # lookup. On network failure, serve the cached copy best-effort.
    try:
        res = requests.head(url, timeout=10)
    except requests.RequestException:
        try:
            return read_csv(file_path, encoding='utf-8')
        except FileNotFoundError:
            _remove_entry_from_cache_index(query_key)
            return None

    # NOTE: an etag may have quotes, but we stripped them when caching in
    # order to create a valid file name on some OS's (windows) -- strip
    # them from the response value before comparing
    remote_etag = res.headers.get('ETag')
    if remote_etag is not None and etag == remote_etag.replace('"', ''):
        # the try/except catches discrepancies between the CACHE_INDEX and
        # whether the file actually exists on disk
        try:
            return read_csv(file_path, encoding='utf-8')
        except FileNotFoundError:
            _remove_entry_from_cache_index(query_key)
            return None

    # etag changed (or missing): evict the stale file and index entry
    _remove_entry_from_cache(file_path, query_key)
    return None


def cache_dataset(query, data, etag, url):
    ''' Creates a new entry in the CACHE_INDEX using the query (base 64
    encoded) as the key and etag+url as the value (also base 64 encoded),
    and writes the given `data` to disk using the hex-encoded `etag` as the
    file name. The index is persisted to CACHE_INDEX_PATH.
    '''
    # an etag may be wrapped in quotes but this would create an invalid
    # filename on some OS's (windows) -- remove them before saving
    etag = etag.replace('"', '')

    # BUG FIX: write the dataset file BEFORE touching the index, so a
    # failed write can never leave the index pointing at a missing file
    # (the original updated the in-memory index first).
    file_path = os.path.join(CACHE_HOME, get_file_name_for_etag(etag))
    with open(file_path, 'w', encoding='utf-8') as f:
        f.write(data)

    query_key = str_encode(query)
    CACHE_INDEX[query_key] = str_encode(etag + ":" + url)

    # persist the in-memory index so the cache survives across sessions
    with open(CACHE_INDEX_PATH, 'w', encoding='utf-8') as f:
        json.dump(CACHE_INDEX, f)


def _remove_entry_from_cache(file_path, query_key):
    ''' Drop a cached export entirely: delete the dataset file on disk and
    purge its record from the CACHE_INDEX.
    '''
    # The file may already be gone if the index and the filesystem have
    # drifted apart; treat that as already-removed rather than an error.
    try:
        os.remove(file_path)
    except FileNotFoundError:
        pass

    # always clean up the index record, whether or not a file existed
    _remove_entry_from_cache_index(query_key)


def _remove_entry_from_cache_index(query_key):
    ''' Removes an entry from the CACHE_INDEX dictionary (if present) and
    persists the updated index to CACHE_INDEX_PATH.
    '''
    # BUG FIX: pop with a default so a key that is already absent (index
    # out of sync with callers) doesn't raise KeyError
    CACHE_INDEX.pop(query_key, None)
    with open(CACHE_INDEX_PATH, 'w', encoding='utf-8') as f:
        json.dump(CACHE_INDEX, f)


def _safe_create_dir(name):
    if not os.path.exists(name):
        os.makedirs(name)


def str_encode(s):
    '''
    Base64-encode `s` and return the result as a str (not bytes), since
    the encoded values are written to a JSON file and must be plain
    strings -- the base64 functions natively work on bytes.
    '''
    raw = s.encode('utf-8')
    encoded = b64encode(raw)
    return encoded.decode('utf-8')


def str_decode(s):
    '''
    Inverse of str_encode: base64-decode `s` and return the result as a
    str (not bytes), since the values stored in the cache-index file are
    plain strings -- the base64 functions natively work on bytes.
    '''
    raw = s.encode('utf-8')
    decoded = b64decode(raw)
    return decoded.decode('utf-8')


def get_file_name_for_etag(etag):
    '''Turn an etag into a filesystem-safe file name via hex encoding.

    Hex output uses only [a-f0-9], which is safe in file names, whereas
    the base64 alphabet includes slashes (/) that would break paths.
    '''
    etag_bytes = etag.encode('utf-8')
    return etag_bytes.hex()