import pandas as pd
Pandas has 2 main datatypes:
series = pd.Series(["BMW", "Toyota", "Honda"])
series
0 BMW 1 Toyota 2 Honda dtype: object
colors = pd.Series(["Red", "White", "Blue"])
car_data = pd.DataFrame({"Car Make": series, "Color": colors})
car_data
Car Make | Color | |
---|---|---|
0 | BMW | Red |
1 | Toyota | White |
2 | Honda | Blue |
cars = pd.read_csv("./data/cars.csv")
cars
# You can also import a csv directly from a URL
Make | Colour | Odometer | Doors | Price | |
---|---|---|---|---|---|
0 | Toyota | White | 150043 | 4 | $4,000.00 |
1 | Honda | Red | 87899 | 4 | $5,000.00 |
2 | Toyota | Blue | 32549 | 3 | $7,000.00 |
3 | BMW | Black | 11179 | 5 | $22,000.00 |
4 | Nissan | White | 213095 | 4 | $3,500.00 |
5 | Toyota | Green | 99213 | 4 | $4,500.00 |
6 | Honda | Blue | 45698 | 4 | $7,500.00 |
7 | Honda | Blue | 54738 | 4 | $7,000.00 |
8 | Toyota | White | 60000 | 4 | $6,250.00 |
9 | Nissan | White | 31600 | 4 | $9,700.00 |
Anatomy of a DataFrame
# Exporting a dataframe
cars.to_csv("./data/exported-cars.csv", index=False)
cars.dtypes
Make object Colour object Odometer int64 Doors int64 Price object dtype: object
cars.columns
Index(['Make', 'Colour', 'Odometer', 'Doors', 'Price'], dtype='object')
cars.index
RangeIndex(start=0, stop=10, step=1)
cars.describe()
Odometer | Doors | |
---|---|---|
count | 10.000000 | 10.000000 |
mean | 78601.400000 | 4.000000 |
std | 61983.471735 | 0.471405 |
min | 11179.000000 | 3.000000 |
25% | 35836.250000 | 4.000000 |
50% | 57369.000000 | 4.000000 |
75% | 96384.500000 | 4.000000 |
max | 213095.000000 | 5.000000 |
# dataframe.info() provides info from df.dtypes & df.index
cars.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 10 entries, 0 to 9 Data columns (total 5 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Make 10 non-null object 1 Colour 10 non-null object 2 Odometer 10 non-null int64 3 Doors 10 non-null int64 4 Price 10 non-null object dtypes: int64(2), object(3) memory usage: 528.0+ bytes
len(cars)
10
# Selecting a specific column
cars["Odometer"].sum()
786014
cars.head()
Make | Colour | Odometer | Doors | Price | |
---|---|---|---|---|---|
0 | Toyota | White | 150043 | 4 | $4,000.00 |
1 | Honda | Red | 87899 | 4 | $5,000.00 |
2 | Toyota | Blue | 32549 | 3 | $7,000.00 |
3 | BMW | Black | 11179 | 5 | $22,000.00 |
4 | Nissan | White | 213095 | 4 | $3,500.00 |
cars.tail(3)
Make | Colour | Odometer | Doors | Price | |
---|---|---|---|---|---|
7 | Honda | Blue | 54738 | 4 | $7,000.00 |
8 | Toyota | White | 60000 | 4 | $6,250.00 |
9 | Nissan | White | 31600 | 4 | $9,700.00 |
elements = pd.Series(["water", "earth", "fire", "air", "lightning", "metal", "blood", "lava"],
index=[1,2,3,4,3,2,1,2])
elements
1 water 2 earth 3 fire 4 air 3 lightning 2 metal 1 blood 2 lava dtype: object
elements.loc[2] # On a series
2 earth 2 metal 2 lava dtype: object
elements.iloc[2]
'fire'
elements.iloc[4]
'lightning'
# .iloc[:p] get all items up to position p (exclusive)
elements.iloc[:4]
1 water 2 earth 3 fire 4 air dtype: object
cars.loc[3]
Make BMW Colour Black Odometer 11179 Doors 5 Price $22,000.00 Name: 3, dtype: object
# In the case of a dataframe, index matches the position
cars.iloc[:3] # Fetch records up to position 3 (exclusive)
Make | Colour | Odometer | Doors | Price | |
---|---|---|---|---|---|
0 | Toyota | White | 150043 | 4 | $4,000.00 |
1 | Honda | Red | 87899 | 4 | $5,000.00 |
2 | Toyota | Blue | 32549 | 3 | $7,000.00 |
# Filtering by a column
cars[cars["Make"] == "Toyota"]
Make | Colour | Odometer | Doors | Price | |
---|---|---|---|---|---|
0 | Toyota | White | 150043 | 4 | $4,000.00 |
2 | Toyota | Blue | 32549 | 3 | $7,000.00 |
5 | Toyota | Green | 99213 | 4 | $4,500.00 |
8 | Toyota | White | 60000 | 4 | $6,250.00 |
cars[cars["Odometer"] > 75000]
Make | Colour | Odometer | Doors | Price | |
---|---|---|---|---|---|
0 | Toyota | White | 150043 | 4 | $4,000.00 |
1 | Honda | Red | 87899 | 4 | $5,000.00 |
4 | Nissan | White | 213095 | 4 | $3,500.00 |
5 | Toyota | Green | 99213 | 4 | $4,500.00 |
pd.crosstab(cars["Make"], cars["Doors"])
Doors | 3 | 4 | 5 |
---|---|---|---|
Make | |||
BMW | 0 | 0 | 1 |
Honda | 0 | 3 | 0 |
Nissan | 0 | 2 | 0 |
Toyota | 1 | 3 | 0 |
cars.groupby(["Make"]).mean()
Odometer | Doors | |
---|---|---|
Make | ||
BMW | 11179.000000 | 5.00 |
Honda | 62778.333333 | 4.00 |
Nissan | 122347.500000 | 4.00 |
Toyota | 85451.250000 | 3.75 |
%matplotlib inline
import matplotlib.pyplot as plt
cars["Odometer"].plot()
<AxesSubplot:>
cars["Odometer"].hist()
<AxesSubplot:>
if (cars["Price"].dtype == 'object'):
cars["Price"] = cars["Price"].str.replace('[\$\,]', '', regex=True).astype(float)
cars.head()
Make | Colour | Odometer | Doors | Price | |
---|---|---|---|---|---|
0 | Toyota | White | 150043 | 4 | 4000.0 |
1 | Honda | Red | 87899 | 4 | 5000.0 |
2 | Toyota | Blue | 32549 | 3 | 7000.0 |
3 | BMW | Black | 11179 | 5 | 22000.0 |
4 | Nissan | White | 213095 | 4 | 3500.0 |
cars["Price"].dtype
dtype('float64')
cars["Price"].plot()
<AxesSubplot:>
cars["Make"].str.lower()
0 toyota 1 honda 2 toyota 3 bmw 4 nissan 5 toyota 6 honda 7 honda 8 toyota 9 nissan Name: Make, dtype: object
cars_missing_data = pd.read_csv("./data/cars-missing-data.csv")
cars_missing_data
Make | Colour | Odometer | Doors | Price | |
---|---|---|---|---|---|
0 | Toyota | White | 150043.0 | 4.0 | $4,000 |
1 | Honda | Red | 87899.0 | 4.0 | $5,000 |
2 | Toyota | Blue | NaN | 3.0 | $7,000 |
3 | BMW | Black | 11179.0 | 5.0 | $22,000 |
4 | Nissan | White | 213095.0 | 4.0 | $3,500 |
5 | Toyota | Green | NaN | 4.0 | $4,500 |
6 | Honda | NaN | NaN | 4.0 | $7,500 |
7 | Honda | Blue | NaN | 4.0 | NaN |
8 | Toyota | White | 60000.0 | NaN | NaN |
9 | NaN | White | 31600.0 | 4.0 | $9,700 |
cars_missing_data["Odometer"].fillna(cars_missing_data["Odometer"].mean(), inplace=True)
cars_missing_data
Make | Colour | Odometer | Doors | Price | |
---|---|---|---|---|---|
0 | Toyota | White | 150043.000000 | 4.0 | $4,000 |
1 | Honda | Red | 87899.000000 | 4.0 | $5,000 |
2 | Toyota | Blue | 92302.666667 | 3.0 | $7,000 |
3 | BMW | Black | 11179.000000 | 5.0 | $22,000 |
4 | Nissan | White | 213095.000000 | 4.0 | $3,500 |
5 | Toyota | Green | 92302.666667 | 4.0 | $4,500 |
6 | Honda | NaN | 92302.666667 | 4.0 | $7,500 |
7 | Honda | Blue | 92302.666667 | 4.0 | NaN |
8 | Toyota | White | 60000.000000 | NaN | NaN |
9 | NaN | White | 31600.000000 | 4.0 | $9,700 |
cars_missing_dropped = cars_missing_data.dropna()
cars_missing_dropped
Make | Colour | Odometer | Doors | Price | |
---|---|---|---|---|---|
0 | Toyota | White | 150043.000000 | 4.0 | $4,000 |
1 | Honda | Red | 87899.000000 | 4.0 | $5,000 |
2 | Toyota | Blue | 92302.666667 | 3.0 | $7,000 |
3 | BMW | Black | 11179.000000 | 5.0 | $22,000 |
4 | Nissan | White | 213095.000000 | 4.0 | $3,500 |
5 | Toyota | Green | 92302.666667 | 4.0 | $4,500 |
# Adding column from Pandas Series
seat_count = pd.Series([5, 5, 5, 5, 5, 5]) # Create Series
cars["Seats"] = seat_count # Add series as 'Seats' column to end of dataframe
cars
Make | Colour | Odometer | Doors | Price | Seats | |
---|---|---|---|---|---|---|
0 | Toyota | White | 150043 | 4 | 4000.0 | 5.0 |
1 | Honda | Red | 87899 | 4 | 5000.0 | 5.0 |
2 | Toyota | Blue | 32549 | 3 | 7000.0 | 5.0 |
3 | BMW | Black | 11179 | 5 | 22000.0 | 5.0 |
4 | Nissan | White | 213095 | 4 | 3500.0 | 5.0 |
5 | Toyota | Green | 99213 | 4 | 4500.0 | 5.0 |
6 | Honda | Blue | 45698 | 4 | 7500.0 | NaN |
7 | Honda | Blue | 54738 | 4 | 7000.0 | NaN |
8 | Toyota | White | 60000 | 4 | 6250.0 | NaN |
9 | Nissan | White | 31600 | 4 | 9700.0 | NaN |
cars["Seats"].fillna(5, inplace=True)
cars
Make | Colour | Odometer | Doors | Price | Seats | |
---|---|---|---|---|---|---|
0 | Toyota | White | 150043 | 4 | 4000.0 | 5.0 |
1 | Honda | Red | 87899 | 4 | 5000.0 | 5.0 |
2 | Toyota | Blue | 32549 | 3 | 7000.0 | 5.0 |
3 | BMW | Black | 11179 | 5 | 22000.0 | 5.0 |
4 | Nissan | White | 213095 | 4 | 3500.0 | 5.0 |
5 | Toyota | Green | 99213 | 4 | 4500.0 | 5.0 |
6 | Honda | Blue | 45698 | 4 | 7500.0 | 5.0 |
7 | Honda | Blue | 54738 | 4 | 7000.0 | 5.0 |
8 | Toyota | White | 60000 | 4 | 6250.0 | 5.0 |
9 | Nissan | White | 31600 | 4 | 9700.0 | 5.0 |
# Adding column from Python List
fuel_economy_mpg = [25.3, 19.2, 22.0, 28, 36, 26.9, 24.4, 24.6, 26.5, 29.8] # Length must match dataframe size exactly
cars["Fuel Efficiency"] = fuel_economy_mpg
cars
Make | Colour | Odometer | Doors | Price | Seats | Fuel Efficiency | |
---|---|---|---|---|---|---|---|
0 | Toyota | White | 150043 | 4 | 4000.0 | 5.0 | 25.3 |
1 | Honda | Red | 87899 | 4 | 5000.0 | 5.0 | 19.2 |
2 | Toyota | Blue | 32549 | 3 | 7000.0 | 5.0 | 22.0 |
3 | BMW | Black | 11179 | 5 | 22000.0 | 5.0 | 28.0 |
4 | Nissan | White | 213095 | 4 | 3500.0 | 5.0 | 36.0 |
5 | Toyota | Green | 99213 | 4 | 4500.0 | 5.0 | 26.9 |
6 | Honda | Blue | 45698 | 4 | 7500.0 | 5.0 | 24.4 |
7 | Honda | Blue | 54738 | 4 | 7000.0 | 5.0 | 24.6 |
8 | Toyota | White | 60000 | 4 | 6250.0 | 5.0 | 26.5 |
9 | Nissan | White | 31600 | 4 | 9700.0 | 5.0 | 29.8 |
# Apply an operation to each element of a dataframe column
cars["Odometer"] = cars["Odometer"].apply(lambda km: km / 1.6093) # Convert kilometers to miles
cars
Make | Colour | Odometer | Doors | Price | Seats | Fuel Efficiency | |
---|---|---|---|---|---|---|---|
0 | Toyota | White | 93234.946871 | 4 | 4000.0 | 5.0 | 25.3 |
1 | Honda | Red | 54619.399739 | 4 | 5000.0 | 5.0 | 19.2 |
2 | Toyota | Blue | 20225.563910 | 3 | 7000.0 | 5.0 | 22.0 |
3 | BMW | Black | 6946.498478 | 5 | 22000.0 | 5.0 | 28.0 |
4 | Nissan | White | 132414.714472 | 4 | 3500.0 | 5.0 | 36.0 |
5 | Toyota | Green | 61649.785621 | 4 | 4500.0 | 5.0 | 26.9 |
6 | Honda | Blue | 28396.197104 | 4 | 7500.0 | 5.0 | 24.4 |
7 | Honda | Blue | 34013.546262 | 4 | 7000.0 | 5.0 | 24.6 |
8 | Toyota | White | 37283.290872 | 4 | 6250.0 | 5.0 | 26.5 |
9 | Nissan | White | 19635.866526 | 4 | 9700.0 | 5.0 | 29.8 |
cars["Total Fuel Consumption"] = cars["Odometer"] / cars["Fuel Efficiency"]
cars
Make | Colour | Odometer | Doors | Price | Seats | Fuel Efficiency | Total Fuel Consumption | |
---|---|---|---|---|---|---|---|---|
0 | Toyota | White | 93234.946871 | 4 | 4000.0 | 5.0 | 25.3 | 3685.175766 |
1 | Honda | Red | 54619.399739 | 4 | 5000.0 | 5.0 | 19.2 | 2844.760403 |
2 | Toyota | Blue | 20225.563910 | 3 | 7000.0 | 5.0 | 22.0 | 919.343814 |
3 | BMW | Black | 6946.498478 | 5 | 22000.0 | 5.0 | 28.0 | 248.089231 |
4 | Nissan | White | 132414.714472 | 4 | 3500.0 | 5.0 | 36.0 | 3678.186513 |
5 | Toyota | Green | 61649.785621 | 4 | 4500.0 | 5.0 | 26.9 | 2291.813592 |
6 | Honda | Blue | 28396.197104 | 4 | 7500.0 | 5.0 | 24.4 | 1163.778570 |
7 | Honda | Blue | 34013.546262 | 4 | 7000.0 | 5.0 | 24.6 | 1382.664482 |
8 | Toyota | White | 37283.290872 | 4 | 6250.0 | 5.0 | 26.5 | 1406.916637 |
9 | Nissan | White | 19635.866526 | 4 | 9700.0 | 5.0 | 29.8 | 658.921695 |
# Creating column from a single value
cars["Wheels"] = 4
cars
Make | Colour | Odometer | Doors | Price | Seats | Fuel Efficiency | Total Fuel Consumption | Wheels | |
---|---|---|---|---|---|---|---|---|---|
0 | Toyota | White | 93234.946871 | 4 | 4000.0 | 5.0 | 25.3 | 3685.175766 | 4 |
1 | Honda | Red | 54619.399739 | 4 | 5000.0 | 5.0 | 19.2 | 2844.760403 | 4 |
2 | Toyota | Blue | 20225.563910 | 3 | 7000.0 | 5.0 | 22.0 | 919.343814 | 4 |
3 | BMW | Black | 6946.498478 | 5 | 22000.0 | 5.0 | 28.0 | 248.089231 | 4 |
4 | Nissan | White | 132414.714472 | 4 | 3500.0 | 5.0 | 36.0 | 3678.186513 | 4 |
5 | Toyota | Green | 61649.785621 | 4 | 4500.0 | 5.0 | 26.9 | 2291.813592 | 4 |
6 | Honda | Blue | 28396.197104 | 4 | 7500.0 | 5.0 | 24.4 | 1163.778570 | 4 |
7 | Honda | Blue | 34013.546262 | 4 | 7000.0 | 5.0 | 24.6 | 1382.664482 | 4 |
8 | Toyota | White | 37283.290872 | 4 | 6250.0 | 5.0 | 26.5 | 1406.916637 | 4 |
9 | Nissan | White | 19635.866526 | 4 | 9700.0 | 5.0 | 29.8 | 658.921695 | 4 |
cars["Passed Road Safety"] = True
cars.dtypes
Make object Colour object Odometer float64 Doors int64 Price float64 Seats float64 Fuel Efficiency float64 Total Fuel Consumption float64 Wheels int64 Passed Road Safety bool dtype: object
cars
Make | Colour | Odometer | Doors | Price | Seats | Fuel Efficiency | Total Fuel Consumption | Wheels | Passed Road Safety | |
---|---|---|---|---|---|---|---|---|---|---|
0 | Toyota | White | 93234.946871 | 4 | 4000.0 | 5.0 | 25.3 | 3685.175766 | 4 | True |
1 | Honda | Red | 54619.399739 | 4 | 5000.0 | 5.0 | 19.2 | 2844.760403 | 4 | True |
2 | Toyota | Blue | 20225.563910 | 3 | 7000.0 | 5.0 | 22.0 | 919.343814 | 4 | True |
3 | BMW | Black | 6946.498478 | 5 | 22000.0 | 5.0 | 28.0 | 248.089231 | 4 | True |
4 | Nissan | White | 132414.714472 | 4 | 3500.0 | 5.0 | 36.0 | 3678.186513 | 4 | True |
5 | Toyota | Green | 61649.785621 | 4 | 4500.0 | 5.0 | 26.9 | 2291.813592 | 4 | True |
6 | Honda | Blue | 28396.197104 | 4 | 7500.0 | 5.0 | 24.4 | 1163.778570 | 4 | True |
7 | Honda | Blue | 34013.546262 | 4 | 7000.0 | 5.0 | 24.6 | 1382.664482 | 4 | True |
8 | Toyota | White | 37283.290872 | 4 | 6250.0 | 5.0 | 26.5 | 1406.916637 | 4 | True |
9 | Nissan | White | 19635.866526 | 4 | 9700.0 | 5.0 | 29.8 | 658.921695 | 4 | True |
if "Wheels" in cars.columns:
del cars["Wheels"]
cars
Make | Colour | Odometer | Doors | Price | Seats | Fuel Efficiency | Total Fuel Consumption | Passed Road Safety | |
---|---|---|---|---|---|---|---|---|---|
0 | Toyota | White | 93234.946871 | 4 | 4000.0 | 5.0 | 25.3 | 3685.175766 | True |
1 | Honda | Red | 54619.399739 | 4 | 5000.0 | 5.0 | 19.2 | 2844.760403 | True |
2 | Toyota | Blue | 20225.563910 | 3 | 7000.0 | 5.0 | 22.0 | 919.343814 | True |
3 | BMW | Black | 6946.498478 | 5 | 22000.0 | 5.0 | 28.0 | 248.089231 | True |
4 | Nissan | White | 132414.714472 | 4 | 3500.0 | 5.0 | 36.0 | 3678.186513 | True |
5 | Toyota | Green | 61649.785621 | 4 | 4500.0 | 5.0 | 26.9 | 2291.813592 | True |
6 | Honda | Blue | 28396.197104 | 4 | 7500.0 | 5.0 | 24.4 | 1163.778570 | True |
7 | Honda | Blue | 34013.546262 | 4 | 7000.0 | 5.0 | 24.6 | 1382.664482 | True |
8 | Toyota | White | 37283.290872 | 4 | 6250.0 | 5.0 | 26.5 | 1406.916637 | True |
9 | Nissan | White | 19635.866526 | 4 | 9700.0 | 5.0 | 29.8 | 658.921695 | True |
# Select a percentage of the dataframe in random order
cars_shuffled = cars.sample(frac=1)
cars_shuffled
Make | Colour | Odometer | Doors | Price | Seats | Fuel Efficiency | Total Fuel Consumption | Passed Road Safety | |
---|---|---|---|---|---|---|---|---|---|
2 | Toyota | Blue | 20225.563910 | 3 | 7000.0 | 5.0 | 22.0 | 919.343814 | True |
9 | Nissan | White | 19635.866526 | 4 | 9700.0 | 5.0 | 29.8 | 658.921695 | True |
1 | Honda | Red | 54619.399739 | 4 | 5000.0 | 5.0 | 19.2 | 2844.760403 | True |
6 | Honda | Blue | 28396.197104 | 4 | 7500.0 | 5.0 | 24.4 | 1163.778570 | True |
5 | Toyota | Green | 61649.785621 | 4 | 4500.0 | 5.0 | 26.9 | 2291.813592 | True |
4 | Nissan | White | 132414.714472 | 4 | 3500.0 | 5.0 | 36.0 | 3678.186513 | True |
7 | Honda | Blue | 34013.546262 | 4 | 7000.0 | 5.0 | 24.6 | 1382.664482 | True |
0 | Toyota | White | 93234.946871 | 4 | 4000.0 | 5.0 | 25.3 | 3685.175766 | True |
3 | BMW | Black | 6946.498478 | 5 | 22000.0 | 5.0 | 28.0 | 248.089231 | True |
8 | Toyota | White | 37283.290872 | 4 | 6250.0 | 5.0 | 26.5 | 1406.916637 | True |
cars_shuffled.reset_index(drop=True, inplace=True)
cars_shuffled
Make | Colour | Odometer | Doors | Price | Seats | Fuel Efficiency | Total Fuel Consumption | Passed Road Safety | |
---|---|---|---|---|---|---|---|---|---|
0 | Toyota | Blue | 20225.563910 | 3 | 7000.0 | 5.0 | 22.0 | 919.343814 | True |
1 | Nissan | White | 19635.866526 | 4 | 9700.0 | 5.0 | 29.8 | 658.921695 | True |
2 | Honda | Red | 54619.399739 | 4 | 5000.0 | 5.0 | 19.2 | 2844.760403 | True |
3 | Honda | Blue | 28396.197104 | 4 | 7500.0 | 5.0 | 24.4 | 1163.778570 | True |
4 | Toyota | Green | 61649.785621 | 4 | 4500.0 | 5.0 | 26.9 | 2291.813592 | True |
5 | Nissan | White | 132414.714472 | 4 | 3500.0 | 5.0 | 36.0 | 3678.186513 | True |
6 | Honda | Blue | 34013.546262 | 4 | 7000.0 | 5.0 | 24.6 | 1382.664482 | True |
7 | Toyota | White | 93234.946871 | 4 | 4000.0 | 5.0 | 25.3 | 3685.175766 | True |
8 | BMW | Black | 6946.498478 | 5 | 22000.0 | 5.0 | 28.0 | 248.089231 | True |
9 | Toyota | White | 37283.290872 | 4 | 6250.0 | 5.0 | 26.5 | 1406.916637 | True |