In [2]:
%cd /Users/alisonfowler/Docs/Jupityr-notebooks/
/Users/alisonfowler/Docs/Jupityr-notebooks
In [3]:
import pandas as pd
In [4]:
surveys_df = pd.read_csv("data/surveys.csv")
In [7]:
# Selecting data using labels (columns)

# Method 1: select a 'subset' of the data using the column name
surveys_df['species_id']
Out[7]:
0         NL
1         NL
2         DM
3         DM
4         DM
        ... 
35544     AH
35545     AH
35546     RM
35547     DO
35548    NaN
Name: species_id, Length: 35549, dtype: object
In [8]:
# Method 2: use the column name as an 'attribute'; gives the same output
surveys_df.species_id
Out[8]:
0         NL
1         NL
2         DM
3         DM
4         DM
        ... 
35544     AH
35545     AH
35546     RM
35547     DO
35548    NaN
Name: species_id, Length: 35549, dtype: object
In [9]:
# Creates an object, surveys_species, that only contains the `species_id` column
surveys_species = surveys_df['species_id']
In [10]:
# Select the species and plot columns from the DataFrame
surveys_df[['species_id', 'plot_id']]
Out[10]:
species_id plot_id
0 NL 2
1 NL 3
2 DM 2
3 DM 7
4 DM 3
... ... ...
35544 AH 15
35545 AH 15
35546 RM 10
35547 DO 7
35548 NaN 5

35549 rows × 2 columns

In [11]:
# What happens when you flip the order?
surveys_df[['plot_id', 'species_id']]
Out[11]:
plot_id species_id
0 2 NL
1 3 NL
2 2 DM
3 7 DM
4 3 DM
... ... ...
35544 15 AH
35545 15 AH
35546 10 RM
35547 7 DO
35548 5 NaN

35549 rows × 2 columns

In [12]:
# What happens if you ask for a column that doesn't exist?
surveys_df['speciess']
---------------------------------------------------------------------------
KeyError                                  Traceback (most recent call last)
~/opt/anaconda3/lib/python3.8/site-packages/pandas/core/indexes/base.py in get_loc(self, key, method, tolerance)
   2645             try:
-> 2646                 return self._engine.get_loc(key)
   2647             except KeyError:

pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc()

pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc()

pandas/_libs/hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item()

pandas/_libs/hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item()

KeyError: 'speciess'

During handling of the above exception, another exception occurred:

KeyError                                  Traceback (most recent call last)
<ipython-input-12-211eec9ce8b9> in <module>
      1 # What happens if you ask for a column that doesn't exist?
----> 2 surveys_df['speciess']

~/opt/anaconda3/lib/python3.8/site-packages/pandas/core/frame.py in __getitem__(self, key)
   2798             if self.columns.nlevels > 1:
   2799                 return self._getitem_multilevel(key)
-> 2800             indexer = self.columns.get_loc(key)
   2801             if is_integer(indexer):
   2802                 indexer = [indexer]

~/opt/anaconda3/lib/python3.8/site-packages/pandas/core/indexes/base.py in get_loc(self, key, method, tolerance)
   2646                 return self._engine.get_loc(key)
   2647             except KeyError:
-> 2648                 return self._engine.get_loc(self._maybe_cast_indexer(key))
   2649         indexer = self.get_indexer([key], method=method, tolerance=tolerance)
   2650         if indexer.ndim > 1 or indexer.size > 1:

pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc()

pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc()

pandas/_libs/hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item()

pandas/_libs/hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item()

KeyError: 'speciess'
In [13]:
# Python uses zero-based indexing, unlike R 
# Create a list of numbers:
a = [1, 2, 3, 4, 5]
In [ ]:
# indexing: getting a specific element 
# slicing: selecting a set of elements 

# CHALLENGE 
# 1. What value does the code below return?
#   a[0]
# 2. How about this:
#   a[5]
# 3. In the example above, calling a[5] returns an error. Why is that?
# 4. What about?
#   a[len(a)]
In [14]:
a[0]
Out[14]:
1
In [15]:
a[5] # error because there are only spots 0-4  
---------------------------------------------------------------------------
IndexError                                Traceback (most recent call last)
<ipython-input-15-4a84d856522b> in <module>
----> 1 a[5]

IndexError: list index out of range
In [16]:
a[len(a)] 
---------------------------------------------------------------------------
IndexError                                Traceback (most recent call last)
<ipython-input-16-836f840eaf9f> in <module>
----> 1 a[len(a)]

IndexError: list index out of range
In [ ]:
# Slicing subsets of rows in Python 
In [17]:
# Select rows 0, 1, 2 (row 3 is not selected)
surveys_df[0:3]
Out[17]:
record_id month day year plot_id species_id sex hindfoot_length weight
0 1 7 16 1977 2 NL M 32.0 NaN
1 2 7 16 1977 3 NL M 33.0 NaN
2 3 7 16 1977 2 DM F 37.0 NaN
In [18]:
# Select the first 5 rows (rows 0, 1, 2, 3, 4)
surveys_df[:5]
Out[18]:
record_id month day year plot_id species_id sex hindfoot_length weight
0 1 7 16 1977 2 NL M 32.0 NaN
1 2 7 16 1977 3 NL M 33.0 NaN
2 3 7 16 1977 2 DM F 37.0 NaN
3 4 7 16 1977 7 DM M 36.0 NaN
4 5 7 16 1977 3 DM M 35.0 NaN
In [19]:
# Select the last element in the list
# (the slice starts at the last element, and ends at the end of the list)
surveys_df[-1:]
Out[19]:
record_id month day year plot_id species_id sex hindfoot_length weight
35548 35549 12 31 2002 5 NaN NaN NaN NaN
In [ ]:
# Copying vs referencing objects
In [20]:
# Using the 'copy() method'
true_copy_surveys_df = surveys_df.copy()
# this creates a copy of the dataframe, resulting in two objects
In [21]:
# Using the '=' operator
ref_surveys_df = surveys_df
# this does not create a copy - there is really only one object here, but now there are two ways to refer to it 
In [22]:
# Assign the value `0` to the first three rows of data in the DataFrame
ref_surveys_df[0:3] = 0
In [23]:
# ref_surveys_df was created using the '=' operator
ref_surveys_df.head()
Out[23]:
record_id month day year plot_id species_id sex hindfoot_length weight
0 0 0 0 0 0 0 0 0.0 0.0
1 0 0 0 0 0 0 0 0.0 0.0
2 0 0 0 0 0 0 0 0.0 0.0
3 4 7 16 1977 7 DM M 36.0 NaN
4 5 7 16 1977 3 DM M 35.0 NaN
In [24]:
# surveys_df is the original dataframe
surveys_df.head()

# it also got modified! 
Out[24]:
record_id month day year plot_id species_id sex hindfoot_length weight
0 0 0 0 0 0 0 0 0.0 0.0
1 0 0 0 0 0 0 0 0.0 0.0
2 0 0 0 0 0 0 0 0.0 0.0
3 4 7 16 1977 7 DM M 36.0 NaN
4 5 7 16 1977 3 DM M 35.0 NaN
In [25]:
# read in the original csv again
surveys_df = pd.read_csv("data/surveys.csv")
In [ ]:
# Slicing subsets of rows and columns 
# loc is primarily label based indexing. Integers may be used but they are interpreted as a label.
# iloc is primarily integer based indexing
In [26]:
# iloc[row slicing, column slicing]
surveys_df.iloc[0:3, 1:4]
Out[26]:
month day year
0 7 16 1977
1 7 16 1977
2 7 16 1977
In [ ]:
# Notice that we asked for a slice from 0:3. This yielded 3 rows of data. 
# When you ask for 0:3, you are actually telling Python to start at index 0 and select rows 0, 1, 2 up to but 
# not including 3.
In [27]:
# Select all columns for rows of index values 0 and 10
surveys_df.loc[[0, 10], :]
Out[27]:
record_id month day year plot_id species_id sex hindfoot_length weight
0 1 7 16 1977 2 NL M 32.0 NaN
10 11 7 16 1977 5 DS F 53.0 NaN
In [28]:
# What does this do?
surveys_df.loc[0, ['species_id', 'plot_id', 'weight']]

# pulls out the data in the first row of these three columns 
Out[28]:
species_id     NL
plot_id         2
weight        NaN
Name: 0, dtype: object
In [29]:
# What happens when you type the code below?
surveys_df.loc[[0, 10, 35549], :]

# error came up because we asked for a number (35549) that is out of range, I think? 
---------------------------------------------------------------------------
KeyError                                  Traceback (most recent call last)
<ipython-input-29-b85de876aaec> in <module>
      1 # What happens when you type the code below?
----> 2 surveys_df.loc[[0, 10, 35549], :]

~/opt/anaconda3/lib/python3.8/site-packages/pandas/core/indexing.py in __getitem__(self, key)
   1760                 except (KeyError, IndexError, AttributeError):
   1761                     pass
-> 1762             return self._getitem_tuple(key)
   1763         else:
   1764             # we by definition only have the 0th axis

~/opt/anaconda3/lib/python3.8/site-packages/pandas/core/indexing.py in _getitem_tuple(self, tup)
   1287                 continue
   1288 
-> 1289             retval = getattr(retval, self.name)._getitem_axis(key, axis=i)
   1290 
   1291         return retval

~/opt/anaconda3/lib/python3.8/site-packages/pandas/core/indexing.py in _getitem_axis(self, key, axis)
   1952                     raise ValueError("Cannot index with multidimensional key")
   1953 
-> 1954                 return self._getitem_iterable(key, axis=axis)
   1955 
   1956             # nested tuple slicing

~/opt/anaconda3/lib/python3.8/site-packages/pandas/core/indexing.py in _getitem_iterable(self, key, axis)
   1593         else:
   1594             # A collection of keys
-> 1595             keyarr, indexer = self._get_listlike_indexer(key, axis, raise_missing=False)
   1596             return self.obj._reindex_with_indexers(
   1597                 {axis: [keyarr, indexer]}, copy=True, allow_dups=True

~/opt/anaconda3/lib/python3.8/site-packages/pandas/core/indexing.py in _get_listlike_indexer(self, key, axis, raise_missing)
   1550             keyarr, indexer, new_indexer = ax._reindex_non_unique(keyarr)
   1551 
-> 1552         self._validate_read_indexer(
   1553             keyarr, indexer, o._get_axis_number(axis), raise_missing=raise_missing
   1554         )

~/opt/anaconda3/lib/python3.8/site-packages/pandas/core/indexing.py in _validate_read_indexer(self, key, indexer, axis, raise_missing)
   1652             # just raising
   1653             if not (ax.is_categorical() or ax.is_interval()):
-> 1654                 raise KeyError(
   1655                     "Passing list-likes to .loc or [] with any missing labels "
   1656                     "is no longer supported, see "

KeyError: 'Passing list-likes to .loc or [] with any missing labels is no longer supported, see https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#deprecate-loc-reindex-listlike'
In [ ]:
# Indexing by labels loc differs from indexing by integers iloc. 

# With loc, both the start bound and the stop bound are inclusive. 
# When using loc, integers can be used, but the integers refer to the index label and not the position. 

# For example, using loc and select 1:4 will get a different result than using iloc to select rows 1:4.
In [ ]:
# Syntax for iloc indexing to finding a specific data element
# dat.iloc[row, column]
In [30]:
surveys_df.iloc[2, 6]
Out[30]:
'F'
In [ ]:
# CHALLENGE : RANGE 

# 1. What happens when you execute:

# surveys_df[0:1]
# surveys_df[:4]
# surveys_df[:-1]

# 2. What happens when you call:

# surveys_df.iloc[0:4, 1:4]
# surveys_df.loc[0:4, 1:4]

# 3. How are the two commands different?
In [31]:
surveys_df[0:1] # first row, all columns 
# so I think if we don't put a comma, it assumes all columns? (Unlike R)
Out[31]:
record_id month day year plot_id species_id sex hindfoot_length weight
0 1 7 16 1977 2 NL M 32.0 NaN
In [32]:
surveys_df[:4] # first four rows, all columns 
Out[32]:
record_id month day year plot_id species_id sex hindfoot_length weight
0 1 7 16 1977 2 NL M 32.0 NaN
1 2 7 16 1977 3 NL M 33.0 NaN
2 3 7 16 1977 2 DM F 37.0 NaN
3 4 7 16 1977 7 DM M 36.0 NaN
In [33]:
surveys_df[:-1] # all rows except the last row, all columns 
Out[33]:
record_id month day year plot_id species_id sex hindfoot_length weight
0 1 7 16 1977 2 NL M 32.0 NaN
1 2 7 16 1977 3 NL M 33.0 NaN
2 3 7 16 1977 2 DM F 37.0 NaN
3 4 7 16 1977 7 DM M 36.0 NaN
4 5 7 16 1977 3 DM M 35.0 NaN
... ... ... ... ... ... ... ... ... ...
35543 35544 12 31 2002 15 US NaN NaN NaN
35544 35545 12 31 2002 15 AH NaN NaN NaN
35545 35546 12 31 2002 15 AH NaN NaN NaN
35546 35547 12 31 2002 10 RM F 15.0 14.0
35547 35548 12 31 2002 7 DO M 36.0 51.0

35548 rows × 9 columns

In [ ]:
# Notes from: https://realpython.com/lessons/indexing-and-slicing/ 
# If a is a list, then a[m:n] returns the portion of a: 
# Starting with postion m
# Up to but not including n

# Omitting the first index a[:n] starts the slice at the beginning of the list.
# Omitting the last index a[m:] extends the slice from the first index m to the end of the list.
# Omitting both indexes a[:] returns a copy of the entire list, but unlike with a string, it’s a copy, 
# not a reference to the same object.
In [5]:
surveys_df.iloc[0:4, 1:4] 
# first four rows, and 2nd, 3rd, and 4th columns 
# (up until but not including 5th (position 4) column)
Out[5]:
month day year
0 7 16 1977
1 7 16 1977
2 7 16 1977
3 7 16 1977
In [7]:
surveys_df.loc[0:4, 1:4] 
# I think this returns an error because none of the columns are named by these integers? 
# Maybe it would work if we used the words "month", etc. ? 
---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
<ipython-input-7-a48f5f17519f> in <module>
----> 1 surveys_df.loc[0:4, 1:4]
      2 # I think this returns an error because none of the columns are named by these integers?
      3 # Maybe it would work if we used the words "month", etc. ?

~/opt/anaconda3/lib/python3.8/site-packages/pandas/core/indexing.py in __getitem__(self, key)
   1760                 except (KeyError, IndexError, AttributeError):
   1761                     pass
-> 1762             return self._getitem_tuple(key)
   1763         else:
   1764             # we by definition only have the 0th axis

~/opt/anaconda3/lib/python3.8/site-packages/pandas/core/indexing.py in _getitem_tuple(self, tup)
   1287                 continue
   1288 
-> 1289             retval = getattr(retval, self.name)._getitem_axis(key, axis=i)
   1290 
   1291         return retval

~/opt/anaconda3/lib/python3.8/site-packages/pandas/core/indexing.py in _getitem_axis(self, key, axis)
   1910         if isinstance(key, slice):
   1911             self._validate_key(key, axis)
-> 1912             return self._get_slice_axis(key, axis=axis)
   1913         elif com.is_bool_indexer(key):
   1914             return self._getbool_axis(key, axis=axis)

~/opt/anaconda3/lib/python3.8/site-packages/pandas/core/indexing.py in _get_slice_axis(self, slice_obj, axis)
   1794 
   1795         labels = obj._get_axis(axis)
-> 1796         indexer = labels.slice_indexer(
   1797             slice_obj.start, slice_obj.stop, slice_obj.step, kind=self.name
   1798         )

~/opt/anaconda3/lib/python3.8/site-packages/pandas/core/indexes/base.py in slice_indexer(self, start, end, step, kind)
   4711         slice(1, 3)
   4712         """
-> 4713         start_slice, end_slice = self.slice_locs(start, end, step=step, kind=kind)
   4714 
   4715         # return a slice

~/opt/anaconda3/lib/python3.8/site-packages/pandas/core/indexes/base.py in slice_locs(self, start, end, step, kind)
   4924         start_slice = None
   4925         if start is not None:
-> 4926             start_slice = self.get_slice_bound(start, "left", kind)
   4927         if start_slice is None:
   4928             start_slice = 0

~/opt/anaconda3/lib/python3.8/site-packages/pandas/core/indexes/base.py in get_slice_bound(self, label, side, kind)
   4836         # For datetime indices label may be a string that has to be converted
   4837         # to datetime boundary according to its resolution.
-> 4838         label = self._maybe_cast_slice_bound(label, side, kind)
   4839 
   4840         # we need to look up the label

~/opt/anaconda3/lib/python3.8/site-packages/pandas/core/indexes/base.py in _maybe_cast_slice_bound(self, label, side, kind)
   4788         # this is rejected (generally .loc gets you here)
   4789         elif is_integer(label):
-> 4790             self._invalid_indexer("slice", label)
   4791 
   4792         return label

~/opt/anaconda3/lib/python3.8/site-packages/pandas/core/indexes/base.py in _invalid_indexer(self, form, key)
   3074         Consistent invalid indexer message.
   3075         """
-> 3076         raise TypeError(
   3077             f"cannot do {form} indexing on {type(self)} with these "
   3078             f"indexers [{key}] of {type(key)}"

TypeError: cannot do slice indexing on <class 'pandas.core.indexes.base.Index'> with these indexers [1] of <class 'int'>
In [8]:
surveys_df.loc[0:4, "month":"year"] # I wonder if that was the problem? 
Out[8]:
month day year
0 7 16 1977
1 7 16 1977
2 7 16 1977
3 7 16 1977
4 7 16 1977
In [9]:
# Subsetting data using criteria 
surveys_df[surveys_df.year == 2002] # get only samples collected in 2002 
Out[9]:
record_id month day year plot_id species_id sex hindfoot_length weight
33320 33321 1 12 2002 1 DM M 38.0 44.0
33321 33322 1 12 2002 1 DO M 37.0 58.0
33322 33323 1 12 2002 1 PB M 28.0 45.0
33323 33324 1 12 2002 1 AB NaN NaN NaN
33324 33325 1 12 2002 1 DO M 35.0 29.0
... ... ... ... ... ... ... ... ... ...
35544 35545 12 31 2002 15 AH NaN NaN NaN
35545 35546 12 31 2002 15 AH NaN NaN NaN
35546 35547 12 31 2002 10 RM F 15.0 14.0
35547 35548 12 31 2002 7 DO M 36.0 51.0
35548 35549 12 31 2002 5 NaN NaN NaN NaN

2229 rows × 9 columns

In [10]:
surveys_df[surveys_df.year != 2002] # all years except 2002 
Out[10]:
record_id month day year plot_id species_id sex hindfoot_length weight
0 1 7 16 1977 2 NL M 32.0 NaN
1 2 7 16 1977 3 NL M 33.0 NaN
2 3 7 16 1977 2 DM F 37.0 NaN
3 4 7 16 1977 7 DM M 36.0 NaN
4 5 7 16 1977 3 DM M 35.0 NaN
... ... ... ... ... ... ... ... ... ...
33315 33316 12 16 2001 11 NaN NaN NaN NaN
33316 33317 12 16 2001 13 NaN NaN NaN NaN
33317 33318 12 16 2001 14 NaN NaN NaN NaN
33318 33319 12 16 2001 15 NaN NaN NaN NaN
33319 33320 12 16 2001 16 NaN NaN NaN NaN

33320 rows × 9 columns

In [11]:
surveys_df[(surveys_df.year >= 1980) & (surveys_df.year <= 1985)] # between 1980-1985 
Out[11]:
record_id month day year plot_id species_id sex hindfoot_length weight
2270 2271 1 15 1980 8 DO M 35.0 53.0
2271 2272 1 15 1980 11 PF F 16.0 10.0
2272 2273 1 15 1980 18 DM F 34.0 33.0
2273 2274 1 15 1980 11 DM M 38.0 37.0
2274 2275 1 15 1980 8 DO F 33.0 29.0
... ... ... ... ... ... ... ... ... ...
11222 11223 12 8 1985 4 DM M 36.0 40.0
11223 11224 12 8 1985 11 DM M 37.0 49.0
11224 11225 12 8 1985 7 PE M 20.0 18.0
11225 11226 12 8 1985 1 DM M 38.0 47.0
11226 11227 12 8 1985 15 NaN NaN NaN NaN

8957 rows × 9 columns

In [ ]:
# Python syntax cheat sheet 
# Equals: ==
# Not equals: !=
# Greater than, less than: > or <
# Greater than or equal to >=
# Less than or equal to <=
In [ ]:
# CHALLENGE : QUERIES 
# 1. Select a subset of rows in the surveys_df DataFrame that contain data from the year 1999 and that contain 
# weight values less than or equal to 8. How many rows did you end up with? What did your neighbor get?

# 2. You can use the isin command in Python to query a DataFrame based upon a list of values as follows:
# surveys_df[surveys_df['species_id'].isin([listGoesHere])]

# Use the isin function to find all plots that contain particular species in the “surveys” DataFrame. 
# How many records contain these values?

# Experiment with other queries. Create a query that finds all rows with a weight value > or equal to 0.
# The ~ symbol in Python can be used to return the OPPOSITE of the selection that you specify in Python. 
# It is equivalent to is not in. Write a query that selects all rows with sex NOT equal to ‘M’ or ‘F’ in the “surveys” data.
In [12]:
surveys_df[(surveys_df.year == 1999) & (surveys_df.weight <= 8)] # five rows returned  
Out[12]:
record_id month day year plot_id species_id sex hindfoot_length weight
29082 29083 1 16 1999 21 RM M 16.0 8.0
29196 29197 2 20 1999 18 RM M 18.0 8.0
29421 29422 3 15 1999 16 RM M 15.0 8.0
29903 29904 10 10 1999 4 PP M 20.0 7.0
29905 29906 10 10 1999 4 PP M 21.0 4.0
In [14]:
surveys_df[surveys_df['species_id'].isin(['DO'])] # returns all rows with this species ID
Out[14]:
record_id month day year plot_id species_id sex hindfoot_length weight
67 68 8 19 1977 8 DO F 32.0 52.0
291 292 10 17 1977 3 DO F 36.0 33.0
293 294 10 17 1977 3 DO F 37.0 50.0
316 317 10 17 1977 17 DO F 32.0 48.0
322 323 10 17 1977 17 DO F 33.0 31.0
... ... ... ... ... ... ... ... ... ...
35502 35503 12 31 2002 6 DO M 33.0 32.0
35515 35516 12 31 2002 11 DO F 35.0 52.0
35517 35518 12 31 2002 11 DO M 36.0 38.0
35526 35527 12 31 2002 13 DO F 33.0 43.0
35547 35548 12 31 2002 7 DO M 36.0 51.0

3027 rows × 9 columns

In [13]:
surveys_df[(surveys_df.weight >= 0)] # weights greater than or equal to zero
Out[13]:
record_id month day year plot_id species_id sex hindfoot_length weight
62 63 8 19 1977 3 DM M 35.0 40.0
63 64 8 19 1977 7 DM M 37.0 48.0
64 65 8 19 1977 4 DM F 34.0 29.0
65 66 8 19 1977 4 DM F 35.0 46.0
66 67 8 19 1977 7 DM M 35.0 36.0
... ... ... ... ... ... ... ... ... ...
35540 35541 12 31 2002 15 PB F 24.0 31.0
35541 35542 12 31 2002 15 PB F 26.0 29.0
35542 35543 12 31 2002 15 PB F 27.0 34.0
35546 35547 12 31 2002 10 RM F 15.0 14.0
35547 35548 12 31 2002 7 DO M 36.0 51.0

32283 rows × 9 columns

In [12]:
surveys_df[~surveys_df['sex'].isin(["M","F"])] # returns rows neither M or F 
Out[12]:
record_id month day year plot_id species_id sex hindfoot_length weight
13 14 7 16 1977 8 DM NaN NaN NaN
18 19 7 16 1977 4 PF NaN NaN NaN
33 34 7 17 1977 17 DM NaN NaN NaN
56 57 7 18 1977 22 DM NaN NaN NaN
76 77 8 19 1977 4 SS NaN NaN NaN
... ... ... ... ... ... ... ... ... ...
35527 35528 12 31 2002 13 US NaN NaN NaN
35543 35544 12 31 2002 15 US NaN NaN NaN
35544 35545 12 31 2002 15 AH NaN NaN NaN
35545 35546 12 31 2002 15 AH NaN NaN NaN
35548 35549 12 31 2002 5 NaN NaN NaN NaN

2511 rows × 9 columns

In [ ]:
# Using masks to identify a specific condition 
In [14]:
# Set x to 5
x = 5
In [15]:
# What does the code below return?
x > 5
Out[15]:
False
In [16]:
# How about this?
x == 5
Out[16]:
True
In [ ]:
# Creating a Boolean mask
# We can set the True / False criteria & then python will create an output object like the original object but 
# with True or False for each index location
In [17]:
pd.isnull(surveys_df)
Out[17]:
record_id month day year plot_id species_id sex hindfoot_length weight
0 False False False False False False False False True
1 False False False False False False False False True
2 False False False False False False False False True
3 False False False False False False False False True
4 False False False False False False False False True
... ... ... ... ... ... ... ... ... ...
35544 False False False False False False True True True
35545 False False False False False False True True True
35546 False False False False False False False False False
35547 False False False False False False False False False
35548 False False False False False True True True True

35549 rows × 9 columns

In [18]:
# To select just the rows with NaN values, we can use the 'any()' method
surveys_df[pd.isnull(surveys_df).any(axis=1)]
Out[18]:
record_id month day year plot_id species_id sex hindfoot_length weight
0 1 7 16 1977 2 NL M 32.0 NaN
1 2 7 16 1977 3 NL M 33.0 NaN
2 3 7 16 1977 2 DM F 37.0 NaN
3 4 7 16 1977 7 DM M 36.0 NaN
4 5 7 16 1977 3 DM M 35.0 NaN
... ... ... ... ... ... ... ... ... ...
35530 35531 12 31 2002 13 PB F 27.0 NaN
35543 35544 12 31 2002 15 US NaN NaN NaN
35544 35545 12 31 2002 15 AH NaN NaN NaN
35545 35546 12 31 2002 15 AH NaN NaN NaN
35548 35549 12 31 2002 5 NaN NaN NaN NaN

4873 rows × 9 columns

In [20]:
# What does this do?
empty_weights = surveys_df[pd.isnull(surveys_df['weight'])]['weight']
print(empty_weights)
0       NaN
1       NaN
2       NaN
3       NaN
4       NaN
         ..
35530   NaN
35543   NaN
35544   NaN
35545   NaN
35548   NaN
Name: weight, Length: 3266, dtype: float64
In [ ]:
# CHALLENGE : Putting it all together
# 1. Create a new DataFrame that only contains observations with sex values that are not female or male. 
# Assign each sex value in the new DataFrame to a new value of ‘x’. Determine the number of null values in the subset.
In [21]:
empty_sex = surveys_df[pd.isnull(surveys_df['sex'])]['sex']
print(empty_sex)
13       NaN
18       NaN
33       NaN
56       NaN
76       NaN
        ... 
35527    NaN
35543    NaN
35544    NaN
35545    NaN
35548    NaN
Name: sex, Length: 2511, dtype: object
In [ ]:
# not sure how to assign each sex value with a new value 
# there are 2511 samples with no sex assigned 
In [ ]:
# 2. Create a new DataFrame that contains only observations that are of sex male or female and where weight values 
# are greater than 0. Create a stacked bar plot of average weight by plot with male vs female values stacked for 
# each plot.
In [38]:
weights = surveys_df[surveys_df['sex'].isin(["M","F"]) & (surveys_df.weight > 0)]

print(weights)
       record_id  month  day  year  plot_id species_id sex  hindfoot_length  \
62            63      8   19  1977        3         DM   M             35.0   
63            64      8   19  1977        7         DM   M             37.0   
64            65      8   19  1977        4         DM   F             34.0   
65            66      8   19  1977        4         DM   F             35.0   
66            67      8   19  1977        7         DM   M             35.0   
...          ...    ...  ...   ...      ...        ...  ..              ...   
35540      35541     12   31  2002       15         PB   F             24.0   
35541      35542     12   31  2002       15         PB   F             26.0   
35542      35543     12   31  2002       15         PB   F             27.0   
35546      35547     12   31  2002       10         RM   F             15.0   
35547      35548     12   31  2002        7         DO   M             36.0   

       weight  
62       40.0  
63       48.0  
64       29.0  
65       46.0  
66       36.0  
...       ...  
35540    31.0  
35541    29.0  
35542    34.0  
35546    14.0  
35547    51.0  

[32182 rows x 9 columns]
In [43]:
weights_site = weights.groupby(['plot_id', 'sex'])
In [44]:
weights_site_means = weights_site["weight"].mean()
In [45]:
print(weights_site_means)
plot_id  sex
1        F      46.311138
         M      55.950560
2        F      52.561845
         M      51.391382
3        F      31.215349
         M      34.163241
4        F      46.818824
         M      48.888119
5        F      40.974806
         M      40.708551
6        F      36.352288
         M      36.867388
7        F      20.006135
         M      21.194719
8        F      45.623011
         M      49.641372
9        F      53.618469
         M      49.519309
10       F      17.094203
         M      19.971223
11       F      43.515075
         M      43.366197
12       F      49.831731
         M      48.909710
13       F      40.524590
         M      40.097754
14       F      47.355491
         M      45.159378
15       F      26.670236
         M      27.523691
16       F      25.810427
         M      23.811321
17       F      48.176201
         M      47.558853
18       F      36.963514
         M      43.546952
19       F      21.978599
         M      20.306878
20       F      52.624406
         M      44.197279
21       F      25.974832
         M      22.772622
22       F      53.647059
         M      54.572531
23       F      20.564417
         M      18.941463
24       F      47.914405
         M      39.321503
Name: weight, dtype: float64
In [46]:
weights_plot = weights_site_means.unstack() 
In [47]:
weights_plot.plot(kind='bar', stacked=True, title="Average weights by site")
Out[47]:
<matplotlib.axes._subplots.AxesSubplot at 0x351df1610>