Amazon Redshift
- 1. Navigate to Resources > Data Sources
- 2. Select the Amazon Redshift icon and fill in the following fields:

- Name (required)
- Description (optional)
- Host (required)
- Redshift Cluster Endpoint
- Do not append the port or database name to the host URL
- Example: redshift-cluster.sda23451.us-west-1.redshift.amazonaws.com
- Port (required)
- Database (optional)
- Username (required - can be entered later)
- The master username and password are set when the Redshift cluster is created
- Password (required - can be entered later)
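For reference, these same fields are what the R examples below assemble into a JDBC connection URL. A hypothetical example using the endpoint above (port 5439 is Redshift's default; the database name will differ for your cluster):
jdbc:redshift://<Host>:<Port>/<Database>?user=<Username>&password=<Password>
jdbc:redshift://redshift-cluster.sda23451.us-west-1.redshift.amazonaws.com:5439/dev?user=<username>&password=<password>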
Python
Scala
PySpark
R
SQL
%python
import pandas as pd
# Open a connection and create a cursor object to perform database operations
conn = z.getDatasource("zepl_docs_redshift")
cur = conn.cursor()
### Option 1: Read into Python Pandas DataFrame object ###
df = pd.read_sql_query("SELECT * FROM zepl_doc;", conn)
### Option 2: Read into Python list object ###
cur.execute("SELECT * FROM zepl_doc;")
res = cur.fetchall()
# Display Results from dataframe (df) or list object (res)
z.show(df)
# Make the changes to the database persistent
conn.commit()
# Close communication with the database
cur.close()
conn.close()
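If a query needs runtime values, the cursor also accepts parameter placeholders so values are not formatted into the SQL string. A minimal sketch, assuming the connection returned by z.getDatasource behaves like a standard DB-API (psycopg2-style) connection, as the %s placeholders in the write example further down suggest; the salesid value is made up:
%python
# Parameterized read – the filter value is passed separately from the SQL text
conn = z.getDatasource("zepl_docs_redshift")
cur = conn.cursor()
cur.execute("SELECT * FROM zepl_doc WHERE salesid = %s;", (100,))
res = cur.fetchall()
print(res)
# Close communication with the database
cur.close()
conn.close()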
%spark
val df = z.getDatasource("zepl_docs_redshift")
  .asInstanceOf[org.apache.spark.sql.DataFrameReader]
  .option("dbtable", "<table name>")
  .load()
z.show(df)
Option 1: Use Python to connect to Redshift:
%pyspark
import pandas as pd
# Open a connection and create a cursor object to perform database operations
conn = z.getDatasource("zepl_docs_redshift")
cur = conn.cursor()
### Option 1: Read into Python Pandas DataFrame object ###
df = pd.read_sql_query("SELECT * FROM zepl_doc;", conn)
### Option 2: Read into Python list object ###
cur.execute("SELECT * FROM zepl_doc;")
res = cur.fetchall()
# Display Results from dataframe (df) or list object (res)
z.show(df)
# Make the changes to the database persistent
conn.commit()
# Close communication with the database
cur.close()
conn.close()
Option 2: Use Scala to connect to Redshift and read the temp table in PySpark:
%spark
val df = z.getDatasource("zepl_docs_redshift")
  .asInstanceOf[org.apache.spark.sql.DataFrameReader]
  .option("dbtable", "zepl_doc")
  .load()
// Register the DataFrame as a temp table so PySpark can read it through Spark SQL
df.registerTempTable("pass_to_pyspark")
%pyspark
# Read dataframe from Scala
df = sqlContext.sql("select * from pass_to_pyspark")
z.show(df)
First, install the required packages and connect to Redshift:
%r
# Reference Doc: https://aws.amazon.com/blogs/big-data/connecting-r-with-amazon-redshift/
install.packages("RJDBC")
library(RJDBC)
# download Amazon Redshift JDBC driver
download.file('http://s3.amazonaws.com/redshift-downloads/drivers/RedshiftJDBC41-1.1.9.1009.jar','RedshiftJDBC41-1.1.9.1009.jar')
# connect to Amazon Redshift
driver <- JDBC("com.amazon.redshift.jdbc41.Driver", "RedshiftJDBC41-1.1.9.1009.jar", identifier.quote="`")
# Fill in the values below: "jdbc:redshift://<JDBCURL>:<PORT>/<DBNAME>?user=<USER>&password=<PW>"
url <- "jdbc:redshift://redshift-cluster.sdcedksis89.us-west-1.redshift.amazonaws.com:5439/dev?user=<username>&password=<password>"
conn <- dbConnect(driver, url)
Then, execute queries:
%r
res <- dbGetQuery(conn, "select * from zepl_doc")
z.show(res)
%datasource.zepl_docs_redshift
SELECT * FROM zepl_doc
Python
R
%python
# Open a connection and create a cursor object to perform database operations
conn = z.getDatasource("zepl_docs_redshift")
cur = conn.cursor()
# Create table called zepl_doc if not exists
cur.execute(""" create table IF NOT exists zepl_doc(
salesid integer not null,
listid varchar not null distkey) """)
# Insert data (100, "xyz") into the table zepl_doc
cur.execute("INSERT INTO zepl_doc (salesid, listid) VALUES (%s, %s)", (100, "xyz"))
# Persist changes to the database
conn.commit()
# Close communication with the database
cur.close()
conn.close()
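To write several rows in one call, the same cursor can be reused with executemany. A minimal sketch, again assuming a DB-API compatible connection; the row values are made up for illustration:
%python
conn = z.getDatasource("zepl_docs_redshift")
cur = conn.cursor()
# Each tuple maps to the (salesid, listid) columns of zepl_doc
rows = [(101, "abc"), (102, "def"), (103, "ghi")]
cur.executemany("INSERT INTO zepl_doc (salesid, listid) VALUES (%s, %s)", rows)
# Persist changes and close communication with the database
conn.commit()
cur.close()
conn.close()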
%r
# Reference Doc: https://aws.amazon.com/blogs/big-data/connecting-r-with-amazon-redshift/
install.packages("RJDBC")
library(RJDBC)
# download Amazon Redshift JDBC driver
download.file('http://s3.amazonaws.com/redshift-downloads/drivers/RedshiftJDBC41-1.1.9.1009.jar','RedshiftJDBC41-1.1.9.1009.jar')
# connect to Amazon Redshift
driver <- JDBC("com.amazon.redshift.jdbc41.Driver", "RedshiftJDBC41-1.1.9.1009.jar", identifier.quote="`")
# Fill in the values below: "jdbc:redshift://<JDBCURL>:<PORT>/<DBNAME>?user=<USER>&password=<PW>"
url <- "jdbc:redshift://redshift-cluster.sdcedksis89.us-west-1.redshift.amazonaws.com:5439/dev?user=<username>&password=<password>"
conn <- dbConnect(driver, url)
%r
# Append the rows in the res object to the zepl_doc table in Redshift
dbWriteTable(conn, "zepl_doc", res, overwrite=FALSE, append=TRUE)
- Redshift must be reachable from Zepl's whitelisted IP addresses. Configure this with a security group in the AWS Console (a boto3 sketch appears at the end of this page).
- The username and password entered above must belong to a user with access to the desired databases and tables.
- That user needs read and write permissions on those databases and tables (see the sketch below).
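If the user is missing privileges, an administrator can grant them through any of the connection methods above. A minimal sketch over the Python connection; the table and user names are hypothetical:
%python
conn = z.getDatasource("zepl_docs_redshift")
cur = conn.cursor()
# Grant read and write access on the example table to a hypothetical user
cur.execute("GRANT SELECT, INSERT, UPDATE, DELETE ON zepl_doc TO zepl_user;")
conn.commit()
cur.close()
conn.close()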
AWS is constantly evolving, so please review the AWS documentation to ensure current best practices.
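For example, the security group ingress rule mentioned above can also be added programmatically with boto3. A minimal sketch; the security group ID, port, and CIDR below are hypothetical placeholders, and the AWS documentation should be checked for the currently recommended approach:
%python
import boto3

ec2 = boto3.client("ec2")
# Allow inbound traffic on the Redshift port from a whitelisted IP address
ec2.authorize_security_group_ingress(
    GroupId="sg-0123456789abcdef0",  # security group attached to the cluster (hypothetical)
    IpPermissions=[{
        "IpProtocol": "tcp",
        "FromPort": 5439,  # default Redshift port; use your cluster's port
        "ToPort": 5439,
        "IpRanges": [{"CidrIp": "203.0.113.10/32", "Description": "Zepl whitelisted IP"}],
    }],
)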