%cd /Users/alisonfowler/Docs/Jupityr-notebooks/
import pandas as pd
surveys_df = pd.read_csv("data/surveys.csv")
# Selecting data using labels (columns)
# Method 1: select a 'subset' of the data using the column name
surveys_df['species_id']
# Method 2: use the column name as an 'attribute'; gives the same output
surveys_df.species_id
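# Note: attribute access only works when the column name is a valid Python identifier and doesn't clash with an
# existing DataFrame attribute; bracket notation always works. A quick check that the two give the same Series:
surveys_df['species_id'].equals(surveys_df.species_id)  # True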
# Creates an object, surveys_species, that only contains the `species_id` column
surveys_species = surveys_df['species_id']
# Select the species and plot columns from the DataFrame
surveys_df[['species_id', 'plot_id']]
# What happens when you flip the order?
surveys_df[['plot_id', 'species_id']] # the columns come back in the order you list them
# What happens if you ask for a column that doesn't exist?
surveys_df['speciess'] # raises a KeyError - there is no column with this (misspelled) name
# Python uses zero-based indexing, unlike R
# Create a list of numbers:
a = [1, 2, 3, 4, 5]
# indexing: getting a specific element
# slicing: selecting a set of elements
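# A quick illustration of the difference before trying the challenge below:
a[1]    # indexing: a single element, here 2
a[1:3]  # slicing: a new list of elements, here [2, 3]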
# CHALLENGE
# 1. What value does the code below return?
# a[0]
# 2. How about this:
# a[5]
# 3. In the example above, calling a[5] returns an error. Why is that?
# 4. What about?
# a[len(a)]
a[0] # returns 1, the first element
a[5] # IndexError: the list has five elements, so the valid indices are 0-4
a[len(a)] # also an IndexError: len(a) is 5, which is one past the last valid index
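# To get the last element without going out of range, use the last valid index or a negative index:
a[len(a) - 1]  # returns 5
a[-1]          # same thing: negative indices count back from the end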
# Slicing subsets of rows in Python
# Select rows 0, 1, 2 (row 3 is not selected)
surveys_df[0:3]
# Select the first 5 rows (rows 0, 1, 2, 3, 4)
surveys_df[:5]
# Select the last row of the DataFrame
# (the slice starts at the last row and runs to the end of the DataFrame)
surveys_df[-1:]
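# An equivalent way to grab just the last row:
surveys_df.tail(1)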
# Copying vs referencing objects
# Using the 'copy()' method
true_copy_surveys_df = surveys_df.copy()
# this creates a copy of the dataframe, resulting in two objects
# Using the '=' operator
ref_surveys_df = surveys_df
# this does not create a copy - there is really only one object here, but now there are two ways to refer to it
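# A quick way to see the difference: 'is' tests whether two names point to the same object in memory.
ref_surveys_df is surveys_df        # True  - one object, two names
true_copy_surveys_df is surveys_df  # False - copy() made a separate object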
# Assign the value `0` to the first three rows of data in the DataFrame
ref_surveys_df[0:3] = 0
# ref_surveys_df was created using the '=' operator
ref_surveys_df.head()
# surveys_df is the original dataframe
surveys_df.head()
# it also got modified!
# read in the original csv again
surveys_df = pd.read_csv("data/surveys.csv")
# Slicing subsets of rows and columns
# loc is primarily label-based indexing. Integers may be used, but they are interpreted as labels.
# iloc is primarily integer-based (positional) indexing
# iloc[row slicing, column slicing]
surveys_df.iloc[0:3, 1:4]
# Notice that we asked for a slice from 0:3. This yielded 3 rows of data.
# When you ask for 0:3, you are actually telling Python to start at index 0 and select rows 0, 1, 2 up to but
# not including 3.
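# Sanity check: the 0:3 slice really does contain 3 rows
len(surveys_df.iloc[0:3])  # 3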
# Select all columns for rows of index values 0 and 10
surveys_df.loc[[0, 10], :]
# What does this do?
surveys_df.loc[0, ['species_id', 'plot_id', 'weight']]
# pulls out the data in the first row of these three columns
# What happens when you type the code below?
surveys_df.loc[[0, 10, 35549], :]
# this raises an error because 35549 is not one of the index labels - the index only runs from 0 to 35548
# Indexing by labels loc differs from indexing by integers iloc.
# With loc, both the start bound and the stop bound are inclusive.
# When using loc, integers can be used, but the integers refer to the index label and not the position.
# For example, using loc to select rows 1:4 will return a different result than using iloc to select rows 1:4.
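# A quick side-by-side (the two only look similar because the default index labels happen to match the positions):
surveys_df.loc[1:4]   # 4 rows: labels 1, 2, 3, 4 - the stop label is included
surveys_df.iloc[1:4]  # 3 rows: positions 1, 2, 3 - the stop position is excluded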
# Syntax for using iloc to find a specific data element:
# dat.iloc[row, column]
surveys_df.iloc[2, 6]
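# The same element selected by label instead of position (assuming the 7th column, position 6, is 'sex' in this dataset):
surveys_df.loc[2, 'sex']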
# CHALLENGE : RANGE
# 1. What happens when you execute:
# surveys_df[0:1]
# surveys_df[:4]
# surveys_df[:-1]
# 2. What happens when you call:
# surveys_df.iloc[0:4, 1:4]
# surveys_df.loc[0:4, 1:4]
# 3. How are the two commands different?
surveys_df[0:1] # first row, all columns
# with a single slice like this (no comma), pandas slices the rows and returns all columns (unlike R, where you'd give both)
surveys_df[:4] # first four rows, all columns
surveys_df[:-1] # all rows except the last row, all columns
# Notes from: https://realpython.com/lessons/indexing-and-slicing/
# If a is a list, then a[m:n] returns the portion of a:
# Starting with position m
# Up to but not including n
# Omitting the first index a[:n] starts the slice at the beginning of the list.
# Omitting the last index a[m:] extends the slice from the first index m to the end of the list.
# Omitting both indexes a[:] returns a copy of the entire list, but unlike with a string, it’s a copy,
# not a reference to the same object.
surveys_df.iloc[0:4, 1:4]
# first four rows, and the 2nd, 3rd, and 4th columns
# (up to but not including the 5th column, position 4)
surveys_df.loc[0:4, 1:4]
# this raises an error because loc selects by label, and the column labels are names (strings), not integer positions
# It works if we use the column names instead:
surveys_df.loc[0:4, "month":"year"] # label-based slicing - and note both endpoints are included
# Subsetting data using criteria
surveys_df[surveys_df.year == 2002] # get only samples collected in 2002
surveys_df[surveys_df.year != 2002] # all years except 2002
surveys_df[(surveys_df.year >= 1980) & (surveys_df.year <= 1985)] # between 1980-1985
# Python syntax cheat sheet
# Equals: ==
# Not equals: !=
# Greater than, less than: > or <
# Greater than or equal to >=
# Less than or equal to <=
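# Conditions can also be combined with | (OR) and negated with ~ (NOT); each condition needs its own parentheses
surveys_df[(surveys_df.year == 2001) | (surveys_df.year == 2002)] # samples collected in 2001 OR 2002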
# CHALLENGE : QUERIES
# 1. Select a subset of rows in the surveys_df DataFrame that contain data from the year 1999 and that contain
# weight values less than or equal to 8. How many rows did you end up with? What did your neighbor get?
# 2. You can use the isin command in Python to query a DataFrame based upon a list of values as follows:
# surveys_df[surveys_df['species_id'].isin([listGoesHere])]
# Use the isin function to find all plots that contain particular species in the “surveys” DataFrame.
# How many records contain these values?
# Experiment with other queries. Create a query that finds all rows with a weight value > or equal to 0.
# The ~ symbol in Python can be used to return the OPPOSITE of the selection that you specify in Python.
# It is equivalent to is not in. Write a query that selects all rows with sex NOT equal to ‘M’ or ‘F’ in the “surveys” data.
surveys_df[(surveys_df.year == 1999) & (surveys_df.weight <= 8)] # five rows returned
surveys_df[surveys_df['species_id'].isin(['DO'])] # returns all rows with this species ID
surveys_df[(surveys_df.weight >= 0)] # weights greater than or equal to zero
surveys_df[~surveys_df['sex'].isin(["M","F"])] # returns rows where sex is neither M nor F
# Using masks to identify a specific condition
# Set x to 5
x = 5
# What does the code below return?
x > 5 # returns False
# How about this?
x == 5 # returns True
# Creating a Boolean mask
# We can set the True/False criteria and Python will create an output object shaped like the original, but with
# True or False at each index location
pd.isnull(surveys_df)
# To select just the rows with NaN values, we can use the 'any()' method
surveys_df[pd.isnull(surveys_df).any(axis=1)]
# What does this do?
empty_weights = surveys_df[pd.isnull(surveys_df['weight'])]['weight']
print(empty_weights)
# it pulls out the 'weight' values for the rows where weight is NaN, so the printed Series is all NaN
# CHALLENGE : Putting it all together
# 1. Create a new DataFrame that only contains observations with sex values that are not female or male.
# Assign each sex value in the new DataFrame to a new value of ‘x’. Determine the number of null values in the subset.
empty_sex = surveys_df[pd.isnull(surveys_df['sex'])]['sex']
print(empty_sex)
# not sure how to assign each sex value to a new value - one possible approach is sketched below
# there are 2511 samples with no sex assigned
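# One possible sketch for the assignment step (the name new_sex_df is just for illustration): take the rows whose
# sex is neither 'M' nor 'F', copy them so the original DataFrame isn't touched, then overwrite the sex column
new_sex_df = surveys_df[~surveys_df['sex'].isin(["M", "F"])].copy()
new_sex_df['sex'] = 'x'
print(len(new_sex_df))                   # number of observations in the subset
print(new_sex_df['sex'].isnull().sum())  # null values remaining after the assignment (0, since every row is now 'x')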
# 2. Create a new DataFrame that contains only observations that are of sex male or female and where weight values
# are greater than 0. Create a stacked bar plot of average weight by plot with male vs female values stacked for
# each plot.
weights = surveys_df[surveys_df['sex'].isin(["M","F"]) & (surveys_df.weight > 0)]
print(weights)
weights_site = weights.groupby(['plot_id', 'sex'])
weights_site_means = weights_site["weight"].mean()
print(weights_site_means)
weights_plot = weights_site_means.unstack()
weights_plot.plot(kind='bar', stacked=True, title="Average weights by site")