I use the shops_df dataframe to create a shop_id column in the treasures dataframe, by setting shop_name as the index of shops_df and looking up each treasure's shop_id by its shop name:
# get the shop_ids by 'shop_name' from shops and build a 'shop_id' column for treasures
# indexing shops_df by 'shop_name' turns its 'shop_id' column into a lookup Series
shop_ids = shops_df.set_index('shop_name')['shop_id']
treasures_df['shop_id'] = treasures_df['shop'].apply(shop_ids.get)
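As an aside, pandas can do the same lookup without an explicit callback: Series.map accepts a Series and aligns on its index, so the following should be equivalent (missing names become NaN rather than None):

# equivalent lookup using Series.map, which aligns the 'shop' values
# against the shop_name index of shop_ids
treasures_df['shop_id'] = treasures_df['shop'].map(shop_ids)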
In the original code, the shops table was created and populated first, then queried with a 'select * from shops', and the returned shop_ids were used to build the shop_id column in the treasures table. I'm thinking this is better practice than computing the shop_ids ahead of time.
I would be interested in feedback on that, and on why my approach could be problematic.
''' Description: Seed the database with data from JSON files '''
import pandas as pd

from .connection import CreateConnection
from ..dependencies import ROOT_PATH
from ..utils.utils import load_json_data
from .tables_sql import (
    DROP_TABLES,
    SHOPS_SCHEMA,
    TREASURES_SCHEMA,
    POPULATE_SHOPS,
    POPULATE_TREASURES
)


def fetch_shops(shops_path: str) -> pd.DataFrame:
    '''load and prepare shops table'''
    shops_json = load_json_data(shops_path)['shops']
    # create a dataframe and assign sequential shop_ids starting at 1,
    # mirroring the SERIAL values the database will generate
    shops_df = pd.DataFrame(shops_json)
    shops_df['shop_id'] = pd.RangeIndex(1, len(shops_json) + 1)
    return shops_df


def fetch_treasures(treasures_path: str, shops_df: pd.DataFrame) -> pd.DataFrame:
    '''load and prepare treasures table'''
    treasures_json = load_json_data(treasures_path)['treasures']
    # create a dataframe and process the data
    treasures_df = pd.DataFrame(treasures_json)
    # get the shop_ids by 'shop_name' from shops and build a 'shop_id' column for treasures
    # indexing shops_df by 'shop_name' turns its 'shop_id' column into a lookup Series
    shop_ids = shops_df.set_index('shop_name')['shop_id']
    treasures_df['shop_id'] = treasures_df['shop'].apply(shop_ids.get)
    return treasures_df


def seed_db(env: str = 'test') -> None:
    '''Seed the database'''
    print('Seeding Database...\n')
    # with pg8000.dbapi we need to set pg8000.paramstyle to 'named' for :named placeholders
    with CreateConnection(paramstyle='named') as conn:
        cursor = conn.cursor()
        shops_path = f'{ROOT_PATH}/data/{env}-data/shops.json'
        treasures_path = f'{ROOT_PATH}/data/{env}-data/treasures.json'
        cursor.execute(DROP_TABLES)
        cursor.execute(SHOPS_SCHEMA)
        cursor.execute(TREASURES_SCHEMA)
        shops_df = fetch_shops(shops_path)
        treasures_df = fetch_treasures(treasures_path, shops_df)
        # arguments for the :named placeholders are passed as a list of dictionaries
        cursor.executemany(POPULATE_SHOPS, shops_df.to_dict('records'))
        cursor.executemany(POPULATE_TREASURES, treasures_df.to_dict('records'))
    print('Database seeded!')
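CreateConnection isn't shown here; for readers wondering about the paramstyle argument, a minimal sketch of what such a context manager might look like with pg8000 is below. The credentials are placeholders, not from my actual module:

''' Description: connection context manager (sketch, not the original module) '''
import pg8000.dbapi


class CreateConnection:
    '''Open a pg8000 connection, committing on clean exit.'''

    def __init__(self, paramstyle: str = 'named'):
        # pg8000 reads the DB-API paramstyle from this module-level attribute
        pg8000.dbapi.paramstyle = paramstyle

    def __enter__(self):
        # placeholder credentials; the real module loads these from config
        self.conn = pg8000.dbapi.connect(
            user='postgres', password='postgres', database='treasures'
        )
        return self.conn

    def __exit__(self, exc_type, exc_value, traceback):
        if exc_type is None:
            self.conn.commit()
        self.conn.close()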
SQL
DROP_TABLES = '''
DROP TABLE IF EXISTS treasures;
DROP TABLE IF EXISTS shops;
'''

SHOPS_SCHEMA = '''
CREATE TABLE shops (
    shop_id SERIAL PRIMARY KEY,
    shop_name VARCHAR(42) NOT NULL,
    owner VARCHAR(42),
    slogan VARCHAR(256)
)
'''

TREASURES_SCHEMA = '''
CREATE TABLE treasures (
    treasure_id SERIAL PRIMARY KEY,
    treasure_name VARCHAR(256) NOT NULL,
    colour VARCHAR(42),
    age INT,
    cost_at_auction FLOAT(2),
    shop_id INT REFERENCES shops(shop_id)
)
'''

POPULATE_SHOPS = '''
INSERT INTO shops (shop_name, owner, slogan)
VALUES (:shop_name, :owner, :slogan)
'''

POPULATE_TREASURES = '''
INSERT INTO treasures (treasure_name, colour, age, cost_at_auction, shop_id)
VALUES (:treasure_name, :colour, :age, :cost_at_auction, :shop_id)
'''
Your approach could be problematic when the shops table already contains data at the start of the import: the first row you insert would not get ID 1, so the shop_ids you computed in memory would no longer match the SERIAL values the database actually assigns, and every treasure would point at the wrong shop.
That being said, when the database is empty I prefer your approach over the original one: you already have all the data in memory, so there is no need to ask the database for things you already know.
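If you do need to be robust against pre-existing rows, the original pattern of reading the generated IDs back is straightforward. A minimal sketch, using the table and column names from your schema:

# insert shops without precomputed IDs, then read back what the
# database actually assigned and use that for the treasures lookup
cursor.executemany(POPULATE_SHOPS, shops_df.to_dict('records'))
cursor.execute('SELECT shop_id, shop_name FROM shops')
shop_ids = {shop_name: shop_id for shop_id, shop_name in cursor.fetchall()}
treasures_df['shop_id'] = treasures_df['shop'].map(shop_ids)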
Incidentally, if you were to use UUIDs rather than integers as IDs, the first problem would go away completely: the IDs are generated client-side, so they are known before anything is inserted.
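For illustration, a sketch of that with Python's uuid module (shops_df and treasures_df as in your code; the schema would need a UUID column instead of SERIAL):

import uuid

# generate the IDs client-side, so they are known before any insert
shops_df['shop_id'] = [str(uuid.uuid4()) for _ in range(len(shops_df))]
shop_ids = shops_df.set_index('shop_name')['shop_id']
treasures_df['shop_id'] = treasures_df['shop'].map(shop_ids)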