Different Ways to Load Data From CSV Files in PostgreSQL

#postgres #python #sql #programming

Populating data

First of all make sure that your user has SUPERUSER privileges.
You can set this using.

ALTER ROLE <user> WITH SUPERUSER;

Here we'll see three ways to populate our data

using inbuilt csv module
using SQL CALL
using pandas

1. Using CSV module

import psycopg2 
import csv

connection = psycopg2.connect(
    database="store", 
    user="postgres",
    password="lgwmfpsc", 
    host="localhost",
    port="5432", # default post
    )

cursor = connection.cursor()

cursor.execute("DROP TABLE IF EXISTS superstore;")
connection.commit()

create_query = '''CREATE TABLE superstore (
    id INT PRIMARY KEY,
    ship_model VARCHAR(255) NOT NULL,
    segment VARCHAR(255) NOT NULL,
    country VARCHAR(255) NOT NULL,
    city VARCHAR(255) NOT NULL,
    state VARCHAR(255) NOT NULL,
    postal_code INT NOT NULL,
    region VARCHAR(255) NOT NULL,
    category VARCHAR(255) NOT NULL,
    subcategory VARCHAR(255) NOT NULL,
    sales FLOAT NOT NULL,
    quantity INT NOT NULL,
    discount FLOAT NOT NULL,
    profit FLOAT NOT NULL
    )
'''

cursor.execute(create_query)
connection.commit()

with open('./SampleSuperstore.csv') as file:
    id = 1
    csv_data = csv.reader(file)
    next(csv_data)
    for row in csv_data:
        row = [id] + row
        row_tuple = tuple(row)
        cursor.execute('''INSERT INTO superstore (id, ship_model, segment, country, city, state, postal_code,
                       region, category, subcategory, sales, quantity, discount, profit)
                       VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)''', row_tuple)
        id += 1

connection.commit()

cursor.execute("SELECT * FROM superstore LIMIT 10")
print(cursor.fetchall())

connection.close()

Before iterating throught the list next is used to skip the first line of the file as they are headers.

2. Using SQL CALL

import psycopg2
import csv

connection = psycopg2.connect(
    user = 'postgres',
    password = 'lgwmfpsc', 
    host = 'localhost',
    database = 'store', 
    port="5432"
    )

cursor = connection.cursor()

create_query = '''CREATE TABLE superstore (
    id INT(255) NOT NULL AUTO_INCREMENT,
    ship_model VARCHAR(255) NOT NULL,
    segment VARCHAR(255) NOT NULL,
    country VARCHAR(255) NOT NULL,
    city VARCHAR(255) NOT NULL,
    state VARCHAR(255) NOT NULL,
    postal_code INT NOT NULL,
    region VARCHAR(255) NOT NULL,
    category VARCHAR(255) NOT NULL,
    subcategory VARCHAR(255) NOT NULL,
    sales FLOAT NOT NULL,
    quantity INT NOT NULL,
    discount FLOAT NOT NULL,
    profit FLOAT NOT NULL,
    PRIMARY KEY(id))
'''



cursor.execute("DROP TABLE IF EXISTS superstore")
cursor.execute(create_query)

q = '''LOAD DATA LOCAL INFILE '/home/sapan/codes/Linkedin-Python/database-python/csv-database/SampleSuperstore.csv'
    INTO TABLE superstore
    FIELDS TERMINATED BY ',' ENCLOSED BY "" (ship_model, segment, country, city, state, postal_code,
    region, category, subcategory, sales, quantity, discount, profit);'''

cursor.execute(q)

connection.commit()

cursor.execute("SELECT * FROM superstore LIMIT 10")
print(cursor.fetchall())

connection.close()

3. Usinng pandas

# using pandas with sqlalchemy

import pandas as pd
from sqlalchemy import Column, Integer, String, DateTime, Float, Boolean
from sqlalchemy import create_engine, func
from sqlalchemy.orm import sessionmaker, relationship
from sqlalchemy.ext.declarative import declarative_base

engine = create_engine('postgres://postgres:lgwmfpsc@localhost/store')

Base = declarative_base()

class SalesReport(Base):
    __tablename__ = 'salesreport'

    id = Column(Integer, primary_key=True)
    country = Column("Country", String(length=50))
    state = Column("State", String(length=50))
    city = Column("City", String(length=50))
    postalcode = Column("Postal Code", String(length=50))

    def __repr__(self):
        return f'''<Salesreport(id='{self.country}', country='{self.country}', state='{self.state}', postal_code='{self.postalcode}')>
    '''    

Base.metadata.create_all(engine)

file_name = './sample_data.csv'
data_frame = pd.read_csv(file_name)

# # the below function call automatically import the data into our database, so we do not
# # to work with the session here
data_frame.to_sql(
    con = engine,
    name = SalesReport.__tablename__, 
    if_exists='append', 
    index = False
    )

# # however if we want to prove the data was imported, we need session here
session = sessionmaker()
session.configure(bind=engine)
s = session()

results = s.query(SalesReport).limit(10).all()
for i, r in enumerate(results):
    print(i, r)
    print()


overall_max = s.query(func.max(SalesReport.postalcode)).scalar
print(overall_max) 

results = s.query(SalesReport).order_by(SalesReport.postalcode.desc()).limit(10)
for row in results:
    print(results)

In this post we only see how to populate our data. If you want to view completed details how to use postgreSQL with python refers to this article.
https://sapanravidas.medium.com/postgres-with-python3-ec3cafc5c98d

DEV Community

Different Ways to Load Data From CSV Files in PostgreSQL

Populating data

Here we'll see three ways to populate our data

1. Using CSV module

2. Using SQL CALL

3. Usinng pandas

END

Top comments (0)

Read next

Making custom tools

PySpark： missing value

988. Smallest String Starting From Leaf

Leveraging Python's Built-In Decorator for Improved Performance