import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# Quickest route: let pyplot draw straight onto an implicitly created figure.
x = np.arange(1, 5)
y = 11 * x
plt.plot(x, y);
# 1st Method – create the figure first, then attach an axes to it
fig = plt.figure()
ax = fig.add_subplot()
ax.plot(x, y);
# 2nd Method – figure and axes from a single call (recommended)
x = np.arange(1, 6)
fig, ax = plt.subplots()
# The y-values are random integers in [25, 250) with shape (5, 3).
random_values = np.random.randint(25, 250, size=(5, 3))
ax.plot(x, random_values);
# 0. Import matplotlib
# 1. Prepare data
x = np.arange(1, 6)
y = 11 * x
# 2. Setup plot
fig, ax = plt.subplots(figsize=(5, 5))
# 3. Plot data
ax.plot(x, y)
# 4. Customize plot
labels = {"title": "Simple Plot", "xlabel": "Index", "ylabel": "Values"}
ax.set(**labels)
# 5. Save & Share
fig.savefig('plots/HelloPlot.png')
# 100 evenly spaced samples; both end points (0.1 and 10) are included.
x = np.linspace(start=0.1, stop=10, num=100)
x
array([ 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1. , 1.1, 1.2, 1.3, 1.4, 1.5, 1.6, 1.7, 1.8, 1.9, 2. , 2.1, 2.2, 2.3, 2.4, 2.5, 2.6, 2.7, 2.8, 2.9, 3. , 3.1, 3.2, 3.3, 3.4, 3.5, 3.6, 3.7, 3.8, 3.9, 4. , 4.1, 4.2, 4.3, 4.4, 4.5, 4.6, 4.7, 4.8, 4.9, 5. , 5.1, 5.2, 5.3, 5.4, 5.5, 5.6, 5.7, 5.8, 5.9, 6. , 6.1, 6.2, 6.3, 6.4, 6.5, 6.6, 6.7, 6.8, 6.9, 7. , 7.1, 7.2, 7.3, 7.4, 7.5, 7.6, 7.7, 7.8, 7.9, 8. , 8.1, 8.2, 8.3, 8.4, 8.5, 8.6, 8.7, 8.8, 8.9, 9. , 9.1, 9.2, 9.3, 9.4, 9.5, 9.6, 9.7, 9.8, 9.9, 10. ])
# Three separate figures, each drawn against the same x samples.
# Line plot of x squared
squared = x ** 2
fig, ax = plt.subplots()
ax.plot(x, squared);
# Scatter plot of e**x
exponential = np.exp(x)
fig, ax = plt.subplots()
ax.scatter(x, exponential);
# Scatter plot of sin(x)
sine = np.sin(x)
fig, ax = plt.subplots()
ax.scatter(x, sine);
# Menu for the bar chart: drink name -> price in dollars.
tea_prices = dict([
    ("Oolong Tea", 4.75),
    ("Jasmine Tea", 5.50),
    ("Green Tea", 5.00),
    ("Matcha", 6.00),
    ("Hot Cocoa", 6.25),
])
fig, ax = plt.subplots()
# One vertical bar per drink, in the order the menu was written.
drink_names, drink_prices = tea_prices.keys(), tea_prices.values()
ax.bar(drink_names, drink_prices)
ax.set_title("Iroh's Tea Shop")
ax.set_xlabel("Drink")
ax.set_ylabel("Price ($)");
Food stats are liters of water consumed in the production of 1 kg of food (unless labeled "Single"). Drinks are per 250 mL serving.
# Water footprint per item, listed from largest to smallest.
water_required_to_produce_food = {
    "Chocolate": 17196,
    "Beef": 15415,
    "Sheep Meat": 10412,
    "Pork": 5988,
    "Butter": 5553,
    "Chicken meat": 4325,
    "Cheese": 3178,
    "Olives": 3025,
    "Rice": 2497,
    "Dry Pasta": 1849,
    "Bread": 1608,
    "Pizza (Single)": 1239,
    "Apple": 822,
    "Banana": 790,
    "Potatoes": 287,
    "Milk": 255,
    "Cabbage": 237,
    "Tomato": 214,
    "Egg (Single)": 196,
    "Wine": 109,
    "Beer": 74,
}
fig, ax = plt.subplots(figsize=(12, 8))
food_names = list(water_required_to_produce_food)
water_amounts = list(water_required_to_produce_food.values())
ax.barh(food_names, water_amounts)
# Flip the y-axis so the first dictionary entry is drawn at the top.
ax.invert_yaxis()
ax.set_title("How much water is needed to produce 1kg of food?")
ax.set_xlabel("Water Consumed (L)")
ax.set_ylabel("Food");
# fig.savefig('./plots/water-for-food-production.jpg')
# Histogram of 128 samples drawn with np.random.randn.
sample_count = 128
x = np.random.randn(sample_count)
fig, ax = plt.subplots()
ax.hist(x);
2 Ways to Declare Axes: Implicit & Explicit
Implicit — fig, ax = plt.subplots(nrows=2, ncols=2)
Explicit — fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(nrows=2, ncols=2)
# Explicit: every axes in the 2x2 grid gets its own name
fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(10, 5))
ax1, ax2, ax3, ax4 = axes.flat
# Top-left: line plot
ax1.plot(x, np.abs(x) / 2)
# Top-right: scatter of ten random points
scatter_x = np.random.random(10)
scatter_y = np.random.random(10)
ax2.scatter(scatter_x, scatter_y)
# Bottom-left: bar chart of the tea menu
ax3.bar(tea_prices.keys(), tea_prices.values())
# Bottom-right: histogram
ax4.hist(x)
(array([ 3., 6., 11., 14., 26., 29., 24., 9., 3., 3.]), array([-2.83220509, -2.26084055, -1.68947602, -1.11811149, -0.54674695, 0.02461758, 0.59598212, 1.16734665, 1.73871119, 2.31007572, 2.88144026]), <BarContainer object of 10 artists>)
# Implicit: keep the 2x2 array of axes and pick each one by [row, column]
fig, ax = plt.subplots(nrows=2, ncols=2, figsize=(10, 5))
top_left, top_right = ax[0, 0], ax[0, 1]
bottom_left, bottom_right = ax[1, 0], ax[1, 1]
top_left.plot(x, np.abs(x) / 2)
scatter_x = np.random.random(10)
scatter_y = np.random.random(10)
top_right.scatter(scatter_x, scatter_y)
bottom_left.bar(tea_prices.keys(), tea_prices.values())
bottom_right.hist(x);
# 365 random values, one per calendar day starting 1 January 2021.
daily_index = pd.date_range("1/1/2021", periods=365)
timeseries = pd.Series(data=np.random.randn(365), index=daily_index)
timeseries
2021-01-01 0.198657 2021-01-02 0.811075 2021-01-03 -0.092062 2021-01-04 -2.006635 2021-01-05 0.218282 ... 2021-12-27 -0.196479 2021-12-28 1.244913 2021-12-29 0.314448 2021-12-30 -0.464284 2021-12-31 -1.507438 Freq: D, Length: 365, dtype: float64
# First the raw daily values, then their running total.
timeseries.plot();
running_total = timeseries.cumsum()
running_total.plot();
# Small worked example: "Cum" holds the running total of "Seq".
simple_series = np.array([1, 2, 3, 4, 5])
cumsum_example = pd.DataFrame(simple_series, columns=["Seq"]).assign(
    Cum=lambda frame: frame["Seq"].cumsum()
)
cumsum_example
Seq | Cum | |
---|---|---|
0 | 1 | 1 |
1 | 2 | 3 |
2 | 3 | 6 |
3 | 4 | 10 |
4 | 5 | 15 |
# Load the car sales table from disk and display it.
cars = pd.read_csv(filepath_or_buffer='data/cars.csv')
cars
Make | Colour | Odometer | Doors | Price | |
---|---|---|---|---|---|
0 | Toyota | White | 150043 | 4 | $4,000.00 |
1 | Honda | Red | 87899 | 4 | $5,000.00 |
2 | Toyota | Blue | 32549 | 3 | $7,000.00 |
3 | BMW | Black | 11179 | 5 | $22,000.00 |
4 | Nissan | White | 213095 | 4 | $3,500.00 |
5 | Toyota | Green | 99213 | 4 | $4,500.00 |
6 | Honda | Blue | 45698 | 4 | $7,500.00 |
7 | Honda | Blue | 54738 | 4 | $7,000.00 |
8 | Toyota | White | 60000 | 4 | $6,250.00 |
9 | Nissan | White | 31600 | 4 | $9,700.00 |
# Price arrives from the CSV as text such as "$4,000.00". Remove the "$" and ","
# characters and convert what is left to float.
# The guard asks "is the column already numeric?" rather than "is the dtype
# object?", so the conversion still runs when pandas stores text in a dedicated
# string dtype, and the statement stays safe to run more than once.
if not pd.api.types.is_numeric_dtype(cars["Price"]):
    cars["Price"] = cars["Price"].str.replace('[$,]', '', regex=True).astype(float)
cars.head()
Make | Colour | Odometer | Doors | Price | |
---|---|---|---|---|---|
0 | Toyota | White | 150043 | 4 | 4000.0 |
1 | Honda | Red | 87899 | 4 | 5000.0 |
2 | Toyota | Blue | 32549 | 3 | 7000.0 |
3 | BMW | Black | 11179 | 5 | 22000.0 |
4 | Nissan | White | 213095 | 4 | 3500.0 |
# One sale date per row, generated at weekly ("W") frequency from 1/1/2020.
weekly_sale_dates = pd.date_range(start="1/1/2020", periods=len(cars), freq="W")
cars["Sale Date"] = weekly_sale_dates
cars
Make | Colour | Odometer | Doors | Price | Sale Date | |
---|---|---|---|---|---|---|
0 | Toyota | White | 150043 | 4 | 4000.0 | 2020-01-05 |
1 | Honda | Red | 87899 | 4 | 5000.0 | 2020-01-12 |
2 | Toyota | Blue | 32549 | 3 | 7000.0 | 2020-01-19 |
3 | BMW | Black | 11179 | 5 | 22000.0 | 2020-01-26 |
4 | Nissan | White | 213095 | 4 | 3500.0 | 2020-02-02 |
5 | Toyota | Green | 99213 | 4 | 4500.0 | 2020-02-09 |
6 | Honda | Blue | 45698 | 4 | 7500.0 | 2020-02-16 |
7 | Honda | Blue | 54738 | 4 | 7000.0 | 2020-02-23 |
8 | Toyota | White | 60000 | 4 | 6250.0 | 2020-03-01 |
9 | Nissan | White | 31600 | 4 | 9700.0 | 2020-03-08 |
# Running total of Price, in row order.
running_sales = cars["Price"].cumsum()
cars["Total Sales"] = running_sales
cars
Make | Colour | Odometer | Doors | Price | Sale Date | Total Sales | |
---|---|---|---|---|---|---|---|
0 | Toyota | White | 150043 | 4 | 4000.0 | 2020-01-05 | 4000.0 |
1 | Honda | Red | 87899 | 4 | 5000.0 | 2020-01-12 | 9000.0 |
2 | Toyota | Blue | 32549 | 3 | 7000.0 | 2020-01-19 | 16000.0 |
3 | BMW | Black | 11179 | 5 | 22000.0 | 2020-01-26 | 38000.0 |
4 | Nissan | White | 213095 | 4 | 3500.0 | 2020-02-02 | 41500.0 |
5 | Toyota | Green | 99213 | 4 | 4500.0 | 2020-02-09 | 46000.0 |
6 | Honda | Blue | 45698 | 4 | 7500.0 | 2020-02-16 | 53500.0 |
7 | Honda | Blue | 54738 | 4 | 7000.0 | 2020-02-23 | 60500.0 |
8 | Toyota | White | 60000 | 4 | 6250.0 | 2020-03-01 | 66750.0 |
9 | Nissan | White | 31600 | 4 | 9700.0 | 2020-03-08 | 76450.0 |
# Cumulative sales over time (line), then price against odometer (scatter).
cars.plot.line(x="Sale Date", y="Total Sales");
cars.plot.scatter(x="Odometer", y="Price");
# 10 rows x 4 columns of np.random.rand values, with the columns named a-d.
x = np.random.rand(10, 4)
column_names = ['a', 'b', 'c', 'd']
df = pd.DataFrame(data=x, columns=column_names)
df
a | b | c | d | |
---|---|---|---|---|
0 | 0.056376 | 0.840925 | 0.440115 | 0.253111 |
1 | 0.139327 | 0.537898 | 0.288644 | 0.902088 |
2 | 0.212609 | 0.198575 | 0.240101 | 0.202547 |
3 | 0.829980 | 0.686096 | 0.713809 | 0.686075 |
4 | 0.835714 | 0.265893 | 0.787042 | 0.348383 |
5 | 0.837753 | 0.897795 | 0.268935 | 0.001020 |
6 | 0.998334 | 0.812969 | 0.642296 | 0.989324 |
7 | 0.113705 | 0.100756 | 0.614547 | 0.921541 |
8 | 0.943935 | 0.439403 | 0.928927 | 0.031700 |
9 | 0.749540 | 0.182308 | 0.634020 | 0.845659 |
# The same bar chart requested two ways: through the .plot.bar() accessor
# method, and through .plot() with kind="bar". Both are run here side by side.
df.plot.bar();
df.plot(kind="bar");
# Mean odometer reading for each make.
# The Odometer column is selected *before* aggregating so that mean() is only
# ever applied to that numeric column. Averaging the whole grouped frame first
# also tries to average the text columns (e.g. Colour), which pandas 2.x
# rejects with a TypeError.
avg_odometer_by_make = cars.groupby("Make")["Odometer"].mean()
avg_odometer_by_make
Make BMW 11179.000000 Honda 62778.333333 Nissan 122347.500000 Toyota 85451.250000 Name: Odometer, dtype: float64
# Bar chart of the per-make averages, then a histogram of every odometer reading.
avg_odometer_by_make.plot.bar();
cars["Odometer"].plot.hist(bins=10); # 10 bins by default
# Load the heart-disease table and preview its first rows.
heart_disease = pd.read_csv(filepath_or_buffer="./data/heart-disease.csv")
heart_disease.head()
age | sex | cp | trestbps | chol | fbs | restecg | thalach | exang | oldpeak | slope | ca | thal | target | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 63 | 1 | 3 | 145 | 233 | 1 | 0 | 150 | 0 | 2.3 | 0 | 0 | 1 | 1 |
1 | 37 | 1 | 2 | 130 | 250 | 0 | 1 | 187 | 0 | 3.5 | 0 | 0 | 2 | 1 |
2 | 41 | 0 | 1 | 130 | 204 | 0 | 0 | 172 | 0 | 1.4 | 2 | 0 | 2 | 1 |
3 | 56 | 1 | 1 | 120 | 236 | 0 | 1 | 178 | 0 | 0.8 | 2 | 0 | 2 | 1 |
4 | 57 | 0 | 0 | 120 | 354 | 0 | 1 | 163 | 1 | 0.6 | 2 | 0 | 2 | 1 |
# Age distribution in 20 bins
heart_disease["age"].plot.hist(bins=20);
# One histogram per column, stacked in a single tall figure
heart_disease.plot(kind="hist", figsize=(20,24), subplots=True);
# Keep only the rows whose age is strictly greater than 50
is_over_50 = heart_disease["age"] > 50
over_50 = heart_disease[is_over_50]
over_50
age | sex | cp | trestbps | chol | fbs | restecg | thalach | exang | oldpeak | slope | ca | thal | target | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 63 | 1 | 3 | 145 | 233 | 1 | 0 | 150 | 0 | 2.3 | 0 | 0 | 1 | 1 |
3 | 56 | 1 | 1 | 120 | 236 | 0 | 1 | 178 | 0 | 0.8 | 2 | 0 | 2 | 1 |
4 | 57 | 0 | 0 | 120 | 354 | 0 | 1 | 163 | 1 | 0.6 | 2 | 0 | 2 | 1 |
5 | 57 | 1 | 0 | 140 | 192 | 0 | 1 | 148 | 0 | 0.4 | 1 | 0 | 1 | 1 |
6 | 56 | 0 | 1 | 140 | 294 | 0 | 0 | 153 | 0 | 1.3 | 1 | 0 | 2 | 1 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
297 | 59 | 1 | 0 | 164 | 176 | 1 | 0 | 90 | 0 | 1.0 | 1 | 2 | 1 | 0 |
298 | 57 | 0 | 0 | 140 | 241 | 0 | 1 | 123 | 1 | 0.2 | 1 | 0 | 3 | 0 |
300 | 68 | 1 | 0 | 144 | 193 | 1 | 1 | 141 | 0 | 3.4 | 1 | 2 | 3 | 0 |
301 | 57 | 1 | 0 | 130 | 131 | 0 | 1 | 115 | 1 | 1.2 | 1 | 1 | 3 | 0 |
302 | 57 | 0 | 1 | 130 | 236 | 0 | 0 | 174 | 0 | 0.0 | 1 | 1 | 2 | 0 |
208 rows × 14 columns
# pandas plotting call with no axes supplied
over_50.plot.scatter(x='age', y='chol', c='target', figsize=(12, 8));
# OO Method: create the axes with matplotlib, then hand it to pandas via ax=
fig, ax = plt.subplots(figsize=(12, 8))
# ax.set_xlim([45, 100]);
over_50.plot.scatter(x='age', y='chol', c='target', ax=ax)
<AxesSubplot:xlabel='age', ylabel='chol'>
# Pure OO Method: every call goes through the figure/axes objects
fig, ax = plt.subplots(figsize=(12, 8))
# Plot the data, colouring each point by its "target" value
heart_disease_scatter = ax.scatter(x=over_50["age"],
                                   y=over_50["chol"],
                                   c=over_50["target"])
# Customize the labels (spelling of "Cholesterol" corrected in the displayed text)
ax.set(title="Heart Disease and Cholesterol Levels",
       xlabel="Age",
       ylabel="Cholesterol (mg/dL)")
# Add a legend built from the scatter's own legend elements
ax.legend(*heart_disease_scatter.legend_elements(), title="Target");
# Add a dashed horizontal line at the mean cholesterol level
ax.axhline(over_50["chol"].mean(),
           linestyle='--');
# Two stacked panels sharing the age axis: chol on top, thalach below
fig, (ax0, ax1) = plt.subplots(nrows=2,
                               ncols=1,
                               sharex=True,
                               figsize=(10, 10))
# Plot 1: Cholesterol by Age (spelling corrected in the displayed text)
hd_plot1 = ax0.scatter(x=over_50["age"],
                       y=over_50["chol"],
                       c=over_50["target"])
ax0.set(title="Cholesterol Levels",
        ylabel="Cholesterol")
ax0.legend(*hd_plot1.legend_elements(), title="Heart Disease")
# Dashed line at the mean of the plotted column
ax0.axhline(over_50["chol"].mean(),
            linestyle='--')
# Plot 2: Thalach by Age
hd_plot2 = ax1.scatter(x=over_50["age"],
                       y=over_50["thalach"],
                       c=over_50["target"])
ax1.set(title="Max Heart Rate",
        xlabel="Age",
        ylabel="Max Heart Rate (bpm)")
ax1.legend(*hd_plot2.legend_elements(), title="Heart Disease")
ax1.axhline(over_50["thalach"].mean(),
            linestyle='--')
# Title the figure
fig.suptitle("Heart Disease Analysis", fontsize=16, fontweight="bold");
# Show the style names this matplotlib installation offers to plt.style.use().
plt.style.available
['Solarize_Light2', '_classic_test_patch', 'bmh', 'classic', 'dark_background', 'fast', 'fivethirtyeight', 'ggplot', 'grayscale', 'seaborn', 'seaborn-bright', 'seaborn-colorblind', 'seaborn-dark', 'seaborn-dark-palette', 'seaborn-darkgrid', 'seaborn-deep', 'seaborn-muted', 'seaborn-notebook', 'seaborn-paper', 'seaborn-pastel', 'seaborn-poster', 'seaborn-talk', 'seaborn-ticks', 'seaborn-white', 'seaborn-whitegrid', 'tableau-colorblind10']
# Matplotlib 3.6 renamed its bundled seaborn styles to "seaborn-v0_8..." and
# later releases dropped the old names. Use the new name when this installation
# offers it; otherwise fall back to the original name for older versions.
plt.style.use('seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn')
# Line plot of car prices drawn with the style applied
cars["Price"].plot();
# Bar chart of a 10x4 table of random values
x = np.random.randn(10, 4)
df = pd.DataFrame(x, columns=['a', 'b', 'c', 'd'])
ax = df.plot(kind='bar')
ax.set(title="Random Numbers",
       xlabel="Index",
       ylabel="Value")
# Make sure the legend is shown
ax.legend().set_visible(True);
# Set style
# Matplotlib 3.6 renamed "seaborn-whitegrid" to "seaborn-v0_8-whitegrid" and
# later releases dropped the old name; prefer the new one, fall back otherwise.
plt.style.use('seaborn-v0_8-whitegrid'
              if 'seaborn-v0_8-whitegrid' in plt.style.available
              else 'seaborn-whitegrid')
# Pure OO Method
fig, ax = plt.subplots(figsize=(10, 6))
# Plot the data, coloured by "target" using the 'winter' colormap
heart_disease_scatter = ax.scatter(x=over_50["age"],
                                   y=over_50["chol"],
                                   c=over_50["target"],
                                   cmap='winter')
# Customize the labels (spelling of "Cholesterol" corrected in the displayed text)
ax.set(title="Cholesterol Levels and Heart Disease",
       xlabel="Age",
       ylabel="Cholesterol (mg/dL)")
# Add a legend
ax.legend(*heart_disease_scatter.legend_elements(), title="Heart Disease");
# Add a dashed line to show the average cholesterol level
ax.axhline(over_50["chol"].mean(),
           linestyle='--');
# Two stacked panels sharing the age axis: chol on top, thalach below
fig, (ax0, ax1) = plt.subplots(nrows=2,
                               ncols=1,
                               sharex=True,
                               figsize=(10, 10))
# Plot 1: Cholesterol by Age (spelling corrected in the displayed text)
hd_plot1 = ax0.scatter(x=over_50["age"],
                       y=over_50["chol"],
                       c=over_50["target"],
                       cmap='winter')
ax0.set(title="Cholesterol Levels",
        ylabel="Cholesterol")
# Fix the visible ranges instead of relying on autoscaling
ax0.set_xlim([50, 80])
ax0.set_ylim([100, 600])
ax0.legend(*hd_plot1.legend_elements(), title="Heart Disease")
ax0.axhline(over_50["chol"].mean(),
            linestyle='--')
# Plot 2: Thalach by Age
hd_plot2 = ax1.scatter(x=over_50["age"],
                       y=over_50["thalach"],
                       c=over_50["target"],
                       cmap='winter')
ax1.set(title="Max Heart Rate",
        xlabel="Age",
        ylabel="Max Heart Rate (bpm)")
ax1.set_ylim([60, 200])
ax1.legend(*hd_plot2.legend_elements(), title="Heart Disease")
ax1.axhline(over_50["thalach"].mean(),
            linestyle='--')
# Title the figure
fig.suptitle("Heart Disease Analysis", fontsize=16, fontweight="bold");
# Write the finished figure to disk
fig.savefig("./plots/heart-disease-analysis.jpg")