%cd /Users/alisonfowler/Docs/Jupityr-notebooks/
import pandas as pd
surveys_df = pd.read_csv("data/surveys.csv")
# Selecting data using labels (columns)
# Method 1: select a 'subset' of the data using the column name
surveys_df['species_id']
# Method 2: use the column name as an 'attribute'; gives the same output
surveys_df.species_id
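# Note: attribute access only works when the column name is a valid Python identifier and doesn't clash with an
# existing DataFrame attribute; bracket notation always works. A quick check that the two give the same Series:
surveys_df['species_id'].equals(surveys_df.species_id)  # True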
# Creates an object, surveys_species, that only contains the `species_id` column
surveys_species = surveys_df['species_id']
# Select the species and plot columns from the DataFrame
surveys_df[['species_id', 'plot_id']]
# What happens when you flip the order?
surveys_df[['plot_id', 'species_id']] # the columns come back in the order you list them
# What happens if you ask for a column that doesn't exist?
surveys_df['speciess'] # raises a KeyError - there is no column with this (misspelled) name
# Python uses zero-based indexing, unlike R
# Create a list of numbers:
a = [1, 2, 3, 4, 5]
# indexing: getting a specific element
# slicing: selecting a set of elements
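# A quick illustration of the difference before trying the challenge below:
a[1]    # indexing: a single element, here 2
a[1:3]  # slicing: a new list of elements, here [2, 3]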
# CHALLENGE
# 1. What value does the code below return?
# a[0]
# 2. How about this:
# a[5]
# 3. In the example above, calling a[5] returns an error. Why is that?
# 4. What about?
# a[len(a)]
a[0] # returns 1, the first element
a[5] # IndexError: the list has five elements, so the valid indices are 0-4
a[len(a)] # also an IndexError: len(a) is 5, which is one past the last valid index
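# To get the last element without going out of range, use the last valid index or a negative index:
a[len(a) - 1]  # returns 5
a[-1]          # same thing: negative indices count back from the end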
# Slicing subsets of rows in Python
# Select rows 0, 1, 2 (row 3 is not selected)
surveys_df[0:3]
# Select the first 5 rows (rows 0, 1, 2, 3, 4)
surveys_df[:5]
# Select the last row of the DataFrame
# (the slice starts at the last row and runs to the end of the DataFrame)
surveys_df[-1:]
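# An equivalent way to grab just the last row:
surveys_df.tail(1)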
# Copying vs referencing objects
# Using the 'copy()' method
true_copy_surveys_df = surveys_df.copy()
# this creates a copy of the dataframe, resulting in two objects
# Using the '=' operator
ref_surveys_df = surveys_df
# this does not create a copy - there is really only one object here, but now there are two ways to refer to it
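# A quick way to see the difference: 'is' tests whether two names point to the same object in memory.
ref_surveys_df is surveys_df        # True  - one object, two names
true_copy_surveys_df is surveys_df  # False - copy() made a separate object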
# Assign the value `0` to the first three rows of data in the DataFrame
ref_surveys_df[0:3] = 0
# ref_surveys_df was created using the '=' operator
ref_surveys_df.head()
# surveys_df is the original dataframe
surveys_df.head()
# it also got modified!
# read in the original csv again
surveys_df = pd.read_csv("data/surveys.csv")
# Slicing subsets of rows and columns
# loc is primarily label-based indexing. Integers may be used, but they are interpreted as labels.
# iloc is primarily integer-based (positional) indexing
# iloc[row slicing, column slicing]
surveys_df.iloc[0:3, 1:4]
# Notice that we asked for a slice from 0:3. This yielded 3 rows of data.
# When you ask for 0:3, you are actually telling Python to start at index 0 and select rows 0, 1, 2 up to but
# not including 3.
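# Sanity check: the 0:3 slice really does contain 3 rows
len(surveys_df.iloc[0:3])  # 3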
# Select all columns for rows of index values 0 and 10
surveys_df.loc[[0, 10], :]
# What does this do?
surveys_df.loc[0, ['species_id', 'plot_id', 'weight']]
# pulls out the data in the first row of these three columns
# What happens when you type the code below?
surveys_df.loc[[0, 10, 35549], :]
# this raises an error because 35549 is not one of the index labels - the index only runs from 0 to 35548
# Indexing by labels loc differs from indexing by integers iloc.
# With loc, both the start bound and the stop bound are inclusive.
# When using loc, integers can be used, but the integers refer to the index label and not the position.
# For example, using loc to select rows 1:4 will return a different result than using iloc to select rows 1:4.
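# A quick side-by-side (the two only look similar because the default index labels happen to match the positions):
surveys_df.loc[1:4]   # 4 rows: labels 1, 2, 3, 4 - the stop label is included
surveys_df.iloc[1:4]  # 3 rows: positions 1, 2, 3 - the stop position is excluded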
# Syntax for using iloc to find a specific data element:
# dat.iloc[row, column]
surveys_df.iloc[2, 6]
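# The same element selected by label instead of position (assuming the 7th column, position 6, is 'sex' in this dataset):
surveys_df.loc[2, 'sex']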
# CHALLENGE : RANGE
# 1. What happens when you execute:
# surveys_df[0:1]
# surveys_df[:4]
# surveys_df[:-1]
# 2. What happens when you call:
# surveys_df.iloc[0:4, 1:4]
# surveys_df.loc[0:4, 1:4]
# 3. How are the two commands different?
surveys_df[0:1] # first row, all columns
# with a single slice like this (no comma), pandas slices the rows and returns all columns (unlike R, where you'd give both)
surveys_df[:4] # first four rows, all columns
surveys_df[:-1] # all rows except the last row, all columns
# Notes from: https://realpython.com/lessons/indexing-and-slicing/
# If a is a list, then a[m:n] returns the portion of a:
# Starting with position m
# Up to but not including n
# Omitting the first index a[:n] starts the slice at the beginning of the list.
# Omitting the last index a[m:] extends the slice from the first index m to the end of the list.
# Omitting both indexes a[:] returns a copy of the entire list, but unlike with a string, it’s a copy,
# not a reference to the same object.
surveys_df.iloc[0:4, 1:4]
# first four rows, and the 2nd, 3rd, and 4th columns
# (up to but not including the 5th column, position 4)
surveys_df.loc[0:4, 1:4]
# this raises an error because loc selects by label, and the column labels are names (strings), not integer positions
# It works if we use the column names instead:
surveys_df.loc[0:4, "month":"year"] # label-based slicing - and note both endpoints are included
# Subsetting data using criteria
surveys_df[surveys_df.year == 2002] # get only samples collected in 2002
surveys_df[surveys_df.year != 2002] # all years except 2002
surveys_df[(surveys_df.year >= 1980) & (surveys_df.year <= 1985)] # between 1980-1985
# Python syntax cheat sheet
# Equals: ==
# Not equals: !=
# Greater than, less than: > or <
# Greater than or equal to >=
# Less than or equal to <=
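# Conditions can also be combined with | (OR) and negated with ~ (NOT); each condition needs its own parentheses
surveys_df[(surveys_df.year == 2001) | (surveys_df.year == 2002)] # samples collected in 2001 OR 2002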
# CHALLENGE : QUERIES
# 1. Select a subset of rows in the surveys_df DataFrame that contain data from the year 1999 and that contain
# weight values less than or equal to 8. How many rows did you end up with? What did your neighbor get?
# 2. You can use the isin command in Python to query a DataFrame based upon a list of values as follows:
# surveys_df[surveys_df['species_id'].isin([listGoesHere])]
# Use the isin function to find all plots that contain particular species in the “surveys” DataFrame.
# How many records contain these values?
# Experiment with other queries. Create a query that finds all rows with a weight value > or equal to 0.
# The ~ symbol in Python can be used to return the OPPOSITE of the selection that you specify in Python.
# It is equivalent to is not in. Write a query that selects all rows with sex NOT equal to ‘M’ or ‘F’ in the “surveys” data.
surveys_df[(surveys_df.year == 1999) & (surveys_df.weight <= 8)] # five rows returned
surveys_df[surveys_df['species_id'].isin(['DO'])] # returns all rows with this species ID
surveys_df[(surveys_df.weight >= 0)] # weights greater than or equal to zero
surveys_df[~surveys_df['sex'].isin(["M","F"])] # returns rows where sex is neither M nor F
# Using masks to identify a specific condition
# Set x to 5
x = 5
# What does the code below return?
x > 5 # returns False
# How about this?
x == 5 # returns True
# Creating a Boolean mask
# We can set the True/False criteria and Python will create an output object shaped like the original, but with
# True or False at each index location
pd.isnull(surveys_df)
# To select just the rows with NaN values, we can use the 'any()' method
surveys_df[pd.isnull(surveys_df).any(axis=1)]
# What does this do?
empty_weights = surveys_df[pd.isnull(surveys_df['weight'])]['weight']
print(empty_weights)
# it pulls out the 'weight' values for the rows where weight is NaN, so the printed Series is all NaN
# CHALLENGE : Putting it all together
# 1. Create a new DataFrame that only contains observations with sex values that are not female or male.
# Assign each sex value in the new DataFrame to a new value of ‘x’. Determine the number of null values in the subset.
empty_sex = surveys_df[pd.isnull(surveys_df['sex'])]['sex']
print(empty_sex)
# not sure how to assign each sex value to a new value - one possible approach is sketched below
# there are 2511 samples with no sex assigned
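# One possible sketch for the assignment step (the name new_sex_df is just for illustration): take the rows whose
# sex is neither 'M' nor 'F', copy them so the original DataFrame isn't touched, then overwrite the sex column
new_sex_df = surveys_df[~surveys_df['sex'].isin(["M", "F"])].copy()
new_sex_df['sex'] = 'x'
print(len(new_sex_df))                   # number of observations in the subset
print(new_sex_df['sex'].isnull().sum())  # null values remaining after the assignment (0, since every row is now 'x')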
# 2. Create a new DataFrame that contains only observations that are of sex male or female and where weight values
# are greater than 0. Create a stacked bar plot of average weight by plot with male vs female values stacked for
# each plot.
weights = surveys_df[surveys_df['sex'].isin(["M","F"]) & (surveys_df.weight > 0)]
print(weights)
weights_site = weights.groupby(['plot_id', 'sex'])
weights_site_means = weights_site["weight"].mean()
print(weights_site_means)
weights_plot = weights_site_means.unstack()
weights_plot.plot(kind='bar', stacked=True, title="Average weights by site")