import pandas as pd
%pwd
# downloaded survey data; now need to change working directory to retrieve it
%cd /Users/alisonfowler/Docs/Jupityr-notebooks/
# Note that pd.read_csv is used because we imported pandas as pd
pd.read_csv("data/surveys.csv")
surveys_df = pd.read_csv("data/surveys.csv")
surveys_df
surveys_df.head()
type(surveys_df)
surveys_df.dtypes # the formats of different types of data
# CHALLENGE : DATAFRAMES
# 1. surveys_df.columns
# 2. surveys_df.shape Take note of the output of shape - what format does it return the shape of the DataFrame in?
# HINT: More on tuples, here.
# 3. surveys_df.head() Also, what does surveys_df.head(15) do?
# 4. surveys_df.tail()
surveys_df.columns # returns column names
surveys_df.shape # dimensions - returns the shape as a vector
surveys_df.head() # displays first five rows of the dataframe
surveys_df.head(15) # displays first fifteen rows of the dataframe
surveys_df.tail() # displays last five rows
pd.unique(surveys_df['species_id']) # list all the unique species
# CHALLENGE : STATISTICS
# 1. Create a list of unique site ID’s (“plot_id”) found in the surveys data. Call it site_names.
# How many unique sites are there in the data? How many unique species are in the data?
# 2. What is the difference between len(site_names) and surveys_df['plot_id'].nunique()?
site_names=pd.unique(surveys_df['plot_id']) # create a new object with site names
site_names # look at new object
site_names.shape # how many sites are there
species_names=pd.unique(surveys_df['species_id']) # create a new object with species names
species_names # look
species_names.shape # how many species are there
len(species_names)
len(site_names) # length of vector
surveys_df['plot_id'].nunique() # number of unique values in the vector
# different from previous line b/c the vector could have repeating values.
# in this case, it returns the same number since all values are unique
surveys_df['weight'].describe() # get basic stats for a given column
surveys_df['weight'].min() # extract a stat
# Groups
# Group data by sex
grouped_data = surveys_df.groupby('sex')
# Summary statistics for all numeric columns by sex
grouped_data.describe()
# Provide the mean for each numeric column by sex
grouped_data.mean()
# CHALLENGE : SUMMARY DATA
# 1. How many recorded individuals are female F and how many male M?
# 2. What happens when you group by two columns using the following syntax and then calculate mean values?
# grouped_data2 = surveys_df.groupby(['plot_id', 'sex'])
# grouped_data2.mean()
# 3. Summarize weight values for each site in your data.
# HINT: you can use the following syntax to only create summary statistics for one column in your data.
# by_site['weight'].describe()
grouped_data.count() # number of recorded individuals
# females: 15,690
# males: 17,348
# apparently some missing data for hindfoot length and weights?
grouped_data2 = surveys_df.groupby(['plot_id', 'sex']) # create a new object grouped by two variables
grouped_data2.mean() # displays mean for each variable grouped by sexes within each plot/site
grouped_data2['weight'].describe() # gives information about weights within groups
# group by site only
grouped_data3 = surveys_df.groupby('plot_id')
grouped_data3.mean()
grouped_data3['weight'].describe() # gives information about weights within sites
# but this output looks slightly different from the one on the tutorial site, which has the columns & rows switched
# Quickly creating summary counts
# Count the number of samples by species
species_counts = surveys_df.groupby('species_id')['record_id'].count()
print(species_counts)
surveys_df.groupby('species_id')['record_id'].count()['DO'] # sample count for one particular species
# CHALLENGE : MAKE A LIST
# What’s another way to create a list of species and associated count of the records in the data?
# Hint: you can perform count, min, etc. functions on groupby DataFrames in the same way you can perform them
# on regular DataFrames.
surveys_df.head()
grouped_data4 = surveys_df.groupby('species_id')
grouped_data4.count() # this works?
# Basic Math functions
# Multiply all weight values by 2
surveys_df['weight']*2
# Plotting
# Make sure figures appear inline in Ipython Notebook
%matplotlib inline
# Create a quick bar chart
species_counts.plot(kind='bar'); # what does the semi-colon do?
total_count = surveys_df.groupby('plot_id')['record_id'].nunique()
# Let's plot that too
total_count.plot(kind='bar');
# CHALLENGE: PLOTS
# 1. Create a plot of average weight across all species per site.
# 2. Create a plot of total males versus total females for the entire dataset.
site_weights = surveys_df.groupby('plot_id')['weight'] # group weights by site
site_weights.head()
site_mean_weights = site_weights.mean() # extract means
site_mean_weights.head()
site_mean_weights.plot(kind = 'bar'); # plot mean weights of sites
sex_counts = surveys_df.groupby('sex')['record_id'].count()
print(sex_counts)
sex_counts.plot(kind = 'bar'); # plot number of records of each sex of entire data set
# SUMMARY PLOTTING CHALLENGE
# Create a stacked bar plot, with weight on the Y axis, and the stacked variable being sex.
# The plot should show total weight by sex for each site.
# Example data for making the plot
d = {'one' : pd.Series([1., 2., 3.], index=['a', 'b', 'c']), 'two' : pd.Series([1., 2., 3., 4.], index=['a', 'b', 'c', 'd'])}
pd.DataFrame(d)
# Plot stacked data so columns 'one' and 'two' are stacked
my_df = pd.DataFrame(d)
my_df.plot(kind='bar', stacked=True, title="Example graph")
# let's try to do this with weights by sites and sex
site_sex = surveys_df.groupby(['plot_id', 'sex'])
site_sex_count = site_sex['weight'].sum()
print(site_sex_count)
site_sex_count.unstack()
# transforms grouped data into columns
site_sex_count.plot(kind='bar', stacked=True, title="Total weight by site and sex")
# so if we don't unstack first, it doesn't allow us to stack two variables within a site
to_plot = site_sex_count.unstack()
to_plot.plot(kind='bar', stacked=True, title="Total weight by site and sex") # yay