In [1]:
import pandas as pd
In [5]:
%pwd
Out[5]:
'/Users/alisonfowler'
In [15]:
# downloaded survey data; now need to change working directory to retrieve it
%cd /Users/alisonfowler/Docs/Jupityr-notebooks/
/Users/alisonfowler/Docs/Jupityr-notebooks
In [16]:
# Note that pd.read_csv is used because we imported pandas as pd
pd.read_csv("data/surveys.csv")
Out[16]:
record_id month day year plot_id species_id sex hindfoot_length weight
0 1 7 16 1977 2 NL M 32.0 NaN
1 2 7 16 1977 3 NL M 33.0 NaN
2 3 7 16 1977 2 DM F 37.0 NaN
3 4 7 16 1977 7 DM M 36.0 NaN
4 5 7 16 1977 3 DM M 35.0 NaN
... ... ... ... ... ... ... ... ... ...
35544 35545 12 31 2002 15 AH NaN NaN NaN
35545 35546 12 31 2002 15 AH NaN NaN NaN
35546 35547 12 31 2002 10 RM F 15.0 14.0
35547 35548 12 31 2002 7 DO M 36.0 51.0
35548 35549 12 31 2002 5 NaN NaN NaN NaN

35549 rows × 9 columns

In [17]:
surveys_df = pd.read_csv("data/surveys.csv")
In [18]:
surveys_df
Out[18]:
record_id month day year plot_id species_id sex hindfoot_length weight
0 1 7 16 1977 2 NL M 32.0 NaN
1 2 7 16 1977 3 NL M 33.0 NaN
2 3 7 16 1977 2 DM F 37.0 NaN
3 4 7 16 1977 7 DM M 36.0 NaN
4 5 7 16 1977 3 DM M 35.0 NaN
... ... ... ... ... ... ... ... ... ...
35544 35545 12 31 2002 15 AH NaN NaN NaN
35545 35546 12 31 2002 15 AH NaN NaN NaN
35546 35547 12 31 2002 10 RM F 15.0 14.0
35547 35548 12 31 2002 7 DO M 36.0 51.0
35548 35549 12 31 2002 5 NaN NaN NaN NaN

35549 rows × 9 columns

In [19]:
surveys_df.head()
Out[19]:
record_id month day year plot_id species_id sex hindfoot_length weight
0 1 7 16 1977 2 NL M 32.0 NaN
1 2 7 16 1977 3 NL M 33.0 NaN
2 3 7 16 1977 2 DM F 37.0 NaN
3 4 7 16 1977 7 DM M 36.0 NaN
4 5 7 16 1977 3 DM M 35.0 NaN
In [20]:
type(surveys_df)
Out[20]:
pandas.core.frame.DataFrame
In [21]:
surveys_df.dtypes # the formats of different types of data
Out[21]:
record_id            int64
month                int64
day                  int64
year                 int64
plot_id              int64
species_id          object
sex                 object
hindfoot_length    float64
weight             float64
dtype: object
In [ ]:
# CHALLENGE : DATAFRAMES 
# 1. surveys_df.columns
# 2. surveys_df.shape Take note of the output of shape - what format does it return the shape of the DataFrame in?
# HINT: More on tuples, here.
# 3. surveys_df.head() Also, what does surveys_df.head(15) do?
# 4. surveys_df.tail()
In [22]:
surveys_df.columns # returns column names
Out[22]:
Index(['record_id', 'month', 'day', 'year', 'plot_id', 'species_id', 'sex',
       'hindfoot_length', 'weight'],
      dtype='object')
In [23]:
surveys_df.shape # dimensions - returns the shape as a vector
Out[23]:
(35549, 9)
In [41]:
surveys_df.head() # displays first five rows of the dataframe
Out[41]:
record_id month day year plot_id species_id sex hindfoot_length weight
0 1 7 16 1977 2 NL M 32.0 NaN
1 2 7 16 1977 3 NL M 33.0 NaN
2 3 7 16 1977 2 DM F 37.0 NaN
3 4 7 16 1977 7 DM M 36.0 NaN
4 5 7 16 1977 3 DM M 35.0 NaN
In [24]:
surveys_df.head(15) # displays first fifteen rows of the dataframe
Out[24]:
record_id month day year plot_id species_id sex hindfoot_length weight
0 1 7 16 1977 2 NL M 32.0 NaN
1 2 7 16 1977 3 NL M 33.0 NaN
2 3 7 16 1977 2 DM F 37.0 NaN
3 4 7 16 1977 7 DM M 36.0 NaN
4 5 7 16 1977 3 DM M 35.0 NaN
5 6 7 16 1977 1 PF M 14.0 NaN
6 7 7 16 1977 2 PE F NaN NaN
7 8 7 16 1977 1 DM M 37.0 NaN
8 9 7 16 1977 1 DM F 34.0 NaN
9 10 7 16 1977 6 PF F 20.0 NaN
10 11 7 16 1977 5 DS F 53.0 NaN
11 12 7 16 1977 7 DM M 38.0 NaN
12 13 7 16 1977 3 DM M 35.0 NaN
13 14 7 16 1977 8 DM NaN NaN NaN
14 15 7 16 1977 6 DM F 36.0 NaN
In [25]:
surveys_df.tail() # displays last five rows 
Out[25]:
record_id month day year plot_id species_id sex hindfoot_length weight
35544 35545 12 31 2002 15 AH NaN NaN NaN
35545 35546 12 31 2002 15 AH NaN NaN NaN
35546 35547 12 31 2002 10 RM F 15.0 14.0
35547 35548 12 31 2002 7 DO M 36.0 51.0
35548 35549 12 31 2002 5 NaN NaN NaN NaN
In [26]:
pd.unique(surveys_df['species_id']) # list all the unique species
Out[26]:
array(['NL', 'DM', 'PF', 'PE', 'DS', 'PP', 'SH', 'OT', 'DO', 'OX', 'SS',
       'OL', 'RM', nan, 'SA', 'PM', 'AH', 'DX', 'AB', 'CB', 'CM', 'CQ',
       'RF', 'PC', 'PG', 'PH', 'PU', 'CV', 'UR', 'UP', 'ZL', 'UL', 'CS',
       'SC', 'BA', 'SF', 'RO', 'AS', 'SO', 'PI', 'ST', 'CU', 'SU', 'RX',
       'PB', 'PL', 'PX', 'CT', 'US'], dtype=object)
In [ ]:
# CHALLENGE : STATISTICS 
# 1. Create a list of unique site ID’s (“plot_id”) found in the surveys data. Call it site_names. 
# How many unique sites are there in the data? How many unique species are in the data?

# 2. What is the difference between len(site_names) and surveys_df['plot_id'].nunique()?
In [27]:
site_names=pd.unique(surveys_df['plot_id']) # create a new object with site names
In [28]:
site_names # look at new object 
Out[28]:
array([ 2,  3,  7,  1,  6,  5,  8,  4, 11, 14, 15, 13,  9, 10, 17, 16, 20,
       23, 18, 21, 22, 19, 12, 24])
In [29]:
site_names.shape # how many sites are there 
Out[29]:
(24,)
In [30]:
species_names=pd.unique(surveys_df['species_id']) # create a new object with species names
In [31]:
species_names # look 
Out[31]:
array(['NL', 'DM', 'PF', 'PE', 'DS', 'PP', 'SH', 'OT', 'DO', 'OX', 'SS',
       'OL', 'RM', nan, 'SA', 'PM', 'AH', 'DX', 'AB', 'CB', 'CM', 'CQ',
       'RF', 'PC', 'PG', 'PH', 'PU', 'CV', 'UR', 'UP', 'ZL', 'UL', 'CS',
       'SC', 'BA', 'SF', 'RO', 'AS', 'SO', 'PI', 'ST', 'CU', 'SU', 'RX',
       'PB', 'PL', 'PX', 'CT', 'US'], dtype=object)
In [32]:
species_names.shape # how many species are there
Out[32]:
(49,)
In [70]:
len(species_names)
Out[70]:
49
In [33]:
len(site_names) # length of vector
Out[33]:
24
In [34]:
surveys_df['plot_id'].nunique() # number of unique values in the vector
# different from previous line b/c the vector could have repeating values. 
# in this case, it returns the same number since all values are unique 
Out[34]:
24
In [35]:
surveys_df['weight'].describe() # get basic stats for a given column 
Out[35]:
count    32283.000000
mean        42.672428
std         36.631259
min          4.000000
25%         20.000000
50%         37.000000
75%         48.000000
max        280.000000
Name: weight, dtype: float64
In [36]:
surveys_df['weight'].min() # extract a stat 
Out[36]:
4.0
In [37]:
# Groups 

# Group data by sex
grouped_data = surveys_df.groupby('sex')
In [38]:
# Summary statistics for all numeric columns by sex
grouped_data.describe()
Out[38]:
record_id month ... hindfoot_length weight
count mean std min 25% 50% 75% max count mean ... 75% max count mean std min 25% 50% 75% max
sex
F 15690.0 18036.412046 10423.089000 3.0 8917.50 18075.5 27250.00 35547.0 15690.0 6.583047 ... 36.0 64.0 15303.0 42.170555 36.847958 4.0 20.0 34.0 46.0 274.0
M 17348.0 17754.835601 10132.203323 1.0 8969.75 17727.5 26454.25 35548.0 17348.0 6.392668 ... 36.0 58.0 16879.0 42.995379 36.184981 4.0 20.0 39.0 49.0 280.0

2 rows × 56 columns

In [39]:
# Provide the mean for each numeric column by sex
grouped_data.mean()
Out[39]:
record_id month day year plot_id hindfoot_length weight
sex
F 18036.412046 6.583047 16.007138 1990.644997 11.440854 28.836780 42.170555
M 17754.835601 6.392668 16.184286 1990.480401 11.098282 29.709578 42.995379
In [ ]:
# CHALLENGE : SUMMARY DATA 
# 1. How many recorded individuals are female F and how many male M?
# 2. What happens when you group by two columns using the following syntax and then calculate mean values?
#   grouped_data2 = surveys_df.groupby(['plot_id', 'sex'])
#   grouped_data2.mean()
# 3. Summarize weight values for each site in your data. 
# HINT: you can use the following syntax to only create summary statistics for one column in your data. 
#   by_site['weight'].describe()
In [42]:
grouped_data.count() # number of recorded individuals
# females: 15,690
# males: 17,348 
# apparently some missing data for hindfoot length and weights? 
Out[42]:
record_id month day year plot_id species_id hindfoot_length weight
sex
F 15690 15690 15690 15690 15690 15690 14894 15303
M 17348 17348 17348 17348 17348 17348 16476 16879
In [48]:
grouped_data2 = surveys_df.groupby(['plot_id', 'sex']) # create a new object grouped by two variables
In [49]:
grouped_data2.mean() # displays mean for each variable grouped by sexes within each plot/site
Out[49]:
record_id month day year hindfoot_length weight
plot_id sex
1 F 18390.384434 6.597877 15.338443 1990.933962 31.733911 46.311138
M 17197.740639 6.121461 15.905936 1990.091324 34.302770 55.950560
2 F 17714.753608 6.426804 16.288660 1990.449485 30.161220 52.561845
M 18085.458042 6.340035 15.440559 1990.756119 30.353760 51.391382
3 F 19888.783875 6.604703 16.161254 1992.013438 23.774044 31.215349
M 20226.767857 6.271429 16.450000 1992.275000 23.833744 34.163241
4 F 17489.205275 6.442661 15.746560 1990.235092 33.249102 46.818824
M 18493.841748 6.430097 16.507767 1991.000971 34.097959 48.888119
5 F 12280.793169 6.142315 15.722960 1986.485769 28.921844 40.974806
M 12798.426621 6.194539 15.703072 1986.817406 29.694794 40.708551
6 F 19406.503392 6.628223 16.313433 1991.579376 26.981322 36.352288
M 17849.574607 6.035340 16.091623 1990.556283 27.425591 36.867388
7 F 19069.668657 6.385075 15.313433 1991.441791 19.779553 20.006135
M 19188.729642 6.719870 15.778502 1991.462541 20.536667 21.194719
8 F 18920.276190 6.632143 15.836905 1991.267857 32.187578 45.623011
M 19452.109868 6.571719 15.854527 1991.686673 33.751059 49.641372
9 F 16217.497069 6.499414 15.555686 1989.303634 35.126092 53.618469
M 18000.710159 6.361554 15.209163 1990.632470 34.175732 49.519309
10 F 16001.496454 5.588652 16.964539 1989.248227 18.641791 17.094203
M 15708.704225 5.718310 16.739437 1989.007042 19.567164 19.971223
11 F 16994.962287 6.759124 16.283455 1989.836983 32.029299 43.515075
M 16933.909621 6.374150 15.974733 1989.856171 32.078014 43.366197
12 F 17457.966981 6.509434 16.305660 1990.266981 30.975124 49.831731
M 17592.327500 6.304167 16.367500 1990.400833 31.762489 48.909710
13 F 18033.100318 6.802548 16.229299 1990.619427 27.201014 40.524590
M 16969.044700 6.480204 16.005109 1989.911877 27.893793 40.097754
14 F 17097.145275 6.510578 16.681241 1989.974612 32.973373 47.355491
M 17891.948598 6.660748 16.504673 1990.587850 32.961802 45.159378
15 F 20602.449064 6.569647 16.162162 1992.523909 21.949891 26.670236
M 18104.019560 6.185819 17.413203 1990.770171 21.803109 27.523691
16 F 19002.445946 6.360360 16.819820 1991.351351 23.144928 25.810427
M 18434.714286 6.201465 16.622711 1990.926740 23.480916 23.811321
17 F 18234.322870 6.650224 15.892377 1990.785874 30.918536 48.176201
M 18857.651472 6.569801 16.183286 1991.331434 32.227634 47.558853
18 F 17940.875497 6.698013 15.960265 1990.536424 26.690341 36.963514
M 15106.718850 6.610224 16.797125 1988.551118 27.703072 43.546952
19 F 21848.216475 6.701149 15.226054 1993.417625 21.257937 21.978599
M 19470.779690 6.533563 16.647160 1991.740103 21.071685 20.306878
20 F 17510.769231 6.743077 16.026154 1990.253846 27.069193 52.624406
M 16076.192496 6.489396 16.375204 1989.243067 27.908451 44.197279
21 F 22452.636661 6.860884 16.307692 1993.878887 22.366554 25.974832
M 20120.399113 6.671840 16.203991 1992.199557 21.736721 22.772622
22 F 18499.695976 6.651267 15.521610 1990.973174 34.108320 53.647059
M 18015.365527 6.381872 16.682021 1990.650817 33.359746 54.572531
23 F 15863.193939 6.860606 16.036364 1989.024242 20.051948 20.564417
M 17091.338164 6.391304 16.077295 1989.961353 19.850000 18.941463
24 F 13702.224280 6.596708 16.393004 1987.485597 26.993377 47.914405
M 15208.136082 6.360825 16.971134 1988.641237 25.786996 39.321503
In [52]:
grouped_data2['weight'].describe() # gives information about weights within groups
Out[52]:
count mean std min 25% 50% 75% max
plot_id sex
1 F 826.0 46.311138 33.240958 5.0 26.00 40.0 50.00 196.0
M 1072.0 55.950560 41.035686 4.0 37.00 46.0 54.00 231.0
2 F 954.0 52.561845 45.547697 5.0 25.00 40.0 51.00 274.0
M 1114.0 51.391382 46.690887 5.0 24.00 42.0 50.00 278.0
3 F 873.0 31.215349 30.687451 4.0 15.00 23.0 34.00 199.0
M 827.0 34.163241 40.260426 5.0 13.00 23.0 39.00 250.0
4 F 850.0 46.818824 33.560664 5.0 28.00 40.0 47.00 200.0
M 1010.0 48.888119 32.254168 4.0 32.00 44.5 50.00 187.0
5 F 516.0 40.974806 36.396966 5.0 21.00 35.0 45.00 248.0
M 573.0 40.708551 31.250967 6.0 21.00 40.0 49.00 240.0
6 F 721.0 36.352288 29.513333 5.0 19.00 29.0 41.00 188.0
M 739.0 36.867388 30.867779 6.0 18.00 31.0 46.00 241.0
7 F 326.0 20.006135 17.895937 6.0 12.00 17.0 23.00 170.0
M 303.0 21.194719 23.971252 4.0 11.00 17.0 23.00 235.0
8 F 817.0 45.623011 31.045426 5.0 25.00 42.0 50.00 178.0
M 962.0 49.641372 34.820355 5.0 29.00 45.0 52.00 173.0
9 F 823.0 53.618469 35.572793 6.0 35.00 43.0 54.00 177.0
M 984.0 49.519309 31.888023 6.0 37.00 46.0 50.00 275.0
10 F 138.0 17.094203 14.074820 7.0 10.00 13.0 20.00 130.0
M 139.0 19.971223 25.061068 4.0 10.00 12.0 22.00 237.0
11 F 796.0 43.515075 29.627049 5.0 27.00 40.0 46.00 208.0
M 994.0 43.366197 28.425105 6.0 25.00 43.0 49.00 212.0
12 F 1040.0 49.831731 43.790247 6.0 26.00 41.0 48.25 264.0
M 1174.0 48.909710 39.301038 7.0 25.25 43.0 50.00 280.0
13 F 610.0 40.524590 36.109806 5.0 21.00 31.0 42.00 192.0
M 757.0 40.097754 31.753448 6.0 20.00 34.0 47.00 241.0
14 F 692.0 47.355491 29.563455 5.0 37.00 43.0 48.00 211.0
M 1029.0 45.159378 25.272173 5.0 35.00 44.0 50.00 222.0
15 F 467.0 26.670236 31.983137 4.0 12.50 18.0 26.00 198.0
M 401.0 27.523691 38.631271 4.0 10.00 18.0 25.00 259.0
16 F 211.0 25.810427 20.902314 4.0 13.00 21.0 31.00 158.0
M 265.0 23.811321 14.663726 5.0 11.00 20.0 35.00 61.0
17 F 874.0 48.176201 37.485528 6.0 27.00 41.0 49.00 192.0
M 1011.0 47.558853 34.082010 4.0 27.00 45.0 51.00 216.0
18 F 740.0 36.963514 35.184417 5.0 17.00 28.5 40.00 212.0
M 607.0 43.546952 41.864279 7.0 18.00 33.0 48.00 256.0
19 F 514.0 21.978599 14.008822 6.0 12.00 20.0 29.00 139.0
M 567.0 20.306878 12.553954 4.0 10.00 19.0 25.00 100.0
20 F 631.0 52.624406 55.257665 5.0 17.00 30.0 48.00 220.0
M 588.0 44.197279 43.361503 5.0 17.00 34.0 47.00 223.0
21 F 596.0 25.974832 22.619863 4.0 11.00 24.0 31.00 188.0
M 431.0 22.772622 18.984554 4.0 9.00 19.0 32.00 190.0
22 F 646.0 53.647059 38.588538 5.0 29.00 39.0 54.00 161.0
M 648.0 54.572531 38.841066 6.0 31.00 44.0 53.00 212.0
23 F 163.0 20.564417 18.933945 8.0 12.00 16.0 23.00 199.0
M 205.0 18.941463 17.979740 4.0 10.00 12.0 22.00 131.0
24 F 479.0 47.914405 49.112574 6.0 21.00 33.0 44.00 251.0
M 479.0 39.321503 42.003947 4.0 17.00 24.0 45.00 230.0
In [53]:
# group by site only
grouped_data3 = surveys_df.groupby('plot_id')
In [59]:
grouped_data3.mean()
Out[59]:
record_id month day year hindfoot_length weight
plot_id
1 17674.516792 6.307769 15.646617 1990.428070 33.186760 51.822911
2 17669.809480 6.378760 15.760711 1990.429353 30.259519 52.251688
3 19768.905361 6.470460 16.266958 1991.931072 23.803327 32.654386
4 17998.813611 6.424073 16.165058 1990.625190 33.697636 47.928189
5 13133.017588 6.201005 15.813233 1987.082077 29.322306 40.947802
6 18579.403287 6.307206 16.252212 1991.035398 27.201129 36.738893
7 18080.537990 6.580882 15.511029 1990.680147 20.167203 20.663009
8 19161.210999 6.572713 15.878371 1991.467478 33.025143 47.758001
9 17295.930785 6.423554 15.409607 1990.107438 34.594318 51.432358
10 16180.501066 6.093817 16.377399 1989.330490 19.088561 18.541219
11 17002.651199 6.541710 16.132951 1989.872784 32.054679 43.451757
12 17383.576744 6.369979 16.253700 1990.229598 31.406425 49.496169
13 17414.790637 6.604031 16.135891 1990.201560 27.570887 40.445660
14 17670.336340 6.583554 16.563395 1990.410610 32.969430 46.277199
15 19172.228251 6.458372 16.738073 1991.507016 21.882840 27.042578
16 17776.507740 6.311146 16.910217 1990.462848 23.281184 24.585417
17 18572.008337 6.599313 15.989210 1991.078470 31.621505 47.889593
18 16615.919723 6.634602 16.281661 1989.605536 27.157242 40.005922
19 20042.891505 6.617325 16.070648 1992.127839 21.164474 21.105166
20 16796.862590 6.581295 16.184892 1989.754676 27.460034 48.665303
21 20764.940324 6.747656 16.262575 1992.654731 22.103314 24.627794
22 18185.315225 6.498928 16.135811 1990.763402 33.734854 54.146379
23 15881.472855 6.359019 16.353765 1989.110333 19.969101 19.634146
24 14718.620229 6.447519 16.643130 1988.257634 26.384444 43.679167
In [60]:
grouped_data3['weight'].describe() # gives information about weights within sites
# but this output looks slightly different from the one on the tutorial site, which has the columns & rows switched
Out[60]:
count mean std min 25% 50% 75% max
plot_id
1 1903.0 51.822911 38.176670 4.0 30.0 44.0 53.0 231.0
2 2074.0 52.251688 46.503602 5.0 24.0 41.0 50.0 278.0
3 1710.0 32.654386 35.641630 4.0 14.0 23.0 36.0 250.0
4 1866.0 47.928189 32.886598 4.0 30.0 43.0 50.0 200.0
5 1092.0 40.947802 34.086616 5.0 21.0 37.0 48.0 248.0
6 1463.0 36.738893 30.648310 5.0 18.0 30.0 45.0 243.0
7 638.0 20.663009 21.315325 4.0 11.0 17.0 23.0 235.0
8 1781.0 47.758001 33.192194 5.0 26.0 44.0 51.0 178.0
9 1811.0 51.432358 33.724726 6.0 36.0 45.0 50.0 275.0
10 279.0 18.541219 20.290806 4.0 10.0 12.0 21.0 237.0
11 1793.0 43.451757 28.975514 5.0 26.0 42.0 48.0 212.0
12 2219.0 49.496169 41.630035 6.0 26.0 42.0 50.0 280.0
13 1371.0 40.445660 34.042767 5.0 20.5 33.0 45.0 241.0
14 1728.0 46.277199 27.570389 5.0 36.0 44.0 49.0 222.0
15 869.0 27.042578 35.178142 4.0 11.0 18.0 26.0 259.0
16 480.0 24.585417 17.682334 4.0 12.0 20.0 34.0 158.0
17 1893.0 47.889593 35.802399 4.0 27.0 42.0 50.0 216.0
18 1351.0 40.005922 38.480856 5.0 17.5 30.0 44.0 256.0
19 1084.0 21.105166 13.269840 4.0 11.0 19.0 27.0 139.0
20 1222.0 48.665303 50.111539 5.0 17.0 31.0 47.0 223.0
21 1029.0 24.627794 21.199819 4.0 10.0 22.0 31.0 190.0
22 1298.0 54.146379 38.743967 5.0 29.0 42.0 54.0 212.0
23 369.0 19.634146 18.382678 4.0 10.0 14.0 23.0 199.0
24 960.0 43.679167 45.936588 4.0 19.0 27.5 45.0 251.0
In [61]:
# Quickly creating summary counts 

# Count the number of samples by species
species_counts = surveys_df.groupby('species_id')['record_id'].count()
print(species_counts)
species_id
AB      303
AH      437
AS        2
BA       46
CB       50
CM       13
CQ       16
CS        1
CT        1
CU        1
CV        1
DM    10596
DO     3027
DS     2504
DX       40
NL     1252
OL     1006
OT     2249
OX       12
PB     2891
PC       39
PE     1299
PF     1597
PG        8
PH       32
PI        9
PL       36
PM      899
PP     3123
PU        5
PX        6
RF       75
RM     2609
RO        8
RX        2
SA       75
SC        1
SF       43
SH      147
SO       43
SS      248
ST        1
SU        5
UL        4
UP        8
UR       10
US        4
ZL        2
Name: record_id, dtype: int64
In [63]:
surveys_df.groupby('species_id')['record_id'].count()['DO'] # sample count for one particular species 
Out[63]:
3027
In [ ]:
# CHALLENGE : MAKE A LIST
# What’s another way to create a list of species and associated count of the records in the data? 

# Hint: you can perform count, min, etc. functions on groupby DataFrames in the same way you can perform them 
# on regular DataFrames.
In [66]:
surveys_df.head()
Out[66]:
record_id month day year plot_id species_id sex hindfoot_length weight
0 1 7 16 1977 2 NL M 32.0 NaN
1 2 7 16 1977 3 NL M 33.0 NaN
2 3 7 16 1977 2 DM F 37.0 NaN
3 4 7 16 1977 7 DM M 36.0 NaN
4 5 7 16 1977 3 DM M 35.0 NaN
In [77]:
grouped_data4 = surveys_df.groupby('species_id')
In [78]:
grouped_data4.count() # this works?  
Out[78]:
record_id month day year plot_id sex hindfoot_length weight
species_id
AB 303 303 303 303 303 0 0 0
AH 437 437 437 437 437 1 2 0
AS 2 2 2 2 2 0 0 0
BA 46 46 46 46 46 45 45 45
CB 50 50 50 50 50 0 0 0
CM 13 13 13 13 13 0 0 0
CQ 16 16 16 16 16 0 0 0
CS 1 1 1 1 1 0 0 0
CT 1 1 1 1 1 0 0 0
CU 1 1 1 1 1 0 0 0
CV 1 1 1 1 1 0 0 0
DM 10596 10596 10596 10596 10596 10523 9972 10262
DO 3027 3027 3027 3027 3027 3015 2887 2904
DS 2504 2504 2504 2504 2504 2458 2132 2344
DX 40 40 40 40 40 0 0 0
NL 1252 1252 1252 1252 1252 1177 1074 1152
OL 1006 1006 1006 1006 1006 987 920 970
OT 2249 2249 2249 2249 2249 2213 2139 2160
OX 12 12 12 12 12 9 8 6
PB 2891 2891 2891 2891 2891 2862 2864 2810
PC 39 39 39 39 39 0 0 0
PE 1299 1299 1299 1299 1299 1277 1212 1260
PF 1597 1597 1597 1597 1597 1573 1493 1548
PG 8 8 8 8 8 0 0 0
PH 32 32 32 32 32 32 31 31
PI 9 9 9 9 9 8 9 8
PL 36 36 36 36 36 35 36 36
PM 899 899 899 899 899 883 847 876
PP 3123 3123 3123 3123 3123 3076 3027 3024
PU 5 5 5 5 5 0 0 0
PX 6 6 6 6 6 2 2 2
RF 75 75 75 75 75 75 73 75
RM 2609 2609 2609 2609 2609 2548 2442 2535
RO 8 8 8 8 8 8 8 8
RX 2 2 2 2 2 2 2 2
SA 75 75 75 75 75 0 0 0
SC 1 1 1 1 1 0 0 0
SF 43 43 43 43 43 40 41 41
SH 147 147 147 147 147 143 131 141
SO 43 43 43 43 43 43 41 41
SS 248 248 248 248 248 3 0 2
ST 1 1 1 1 1 0 0 0
SU 5 5 5 5 5 0 0 0
UL 4 4 4 4 4 0 0 0
UP 8 8 8 8 8 0 0 0
UR 10 10 10 10 10 0 0 0
US 4 4 4 4 4 0 0 0
ZL 2 2 2 2 2 0 0 0
In [79]:
# Basic Math functions

# Multiply all weight values by 2
surveys_df['weight']*2
Out[79]:
0          NaN
1          NaN
2          NaN
3          NaN
4          NaN
         ...  
35544      NaN
35545      NaN
35546     28.0
35547    102.0
35548      NaN
Name: weight, Length: 35549, dtype: float64
In [80]:
# Plotting 

# Make sure figures appear inline in Ipython Notebook
%matplotlib inline
In [81]:
# Create a quick bar chart
species_counts.plot(kind='bar'); # what does the semi-colon do? 
In [82]:
total_count = surveys_df.groupby('plot_id')['record_id'].nunique()
In [83]:
# Let's plot that too
total_count.plot(kind='bar');
In [84]:
# CHALLENGE: PLOTS 
# 1. Create a plot of average weight across all species per site.
# 2. Create a plot of total males versus total females for the entire dataset.
In [85]:
site_weights = surveys_df.groupby('plot_id')['weight'] # group weights by site 
In [86]:
site_weights.head() 
Out[86]:
0         NaN
1         NaN
2         NaN
3         NaN
4         NaN
        ...  
1962    124.0
1969     41.0
2048     49.0
2054    119.0
2058     35.0
Name: weight, Length: 120, dtype: float64
In [88]:
site_mean_weights = site_weights.mean() # extract means 
In [89]:
site_mean_weights.head()
Out[89]:
plot_id
1    51.822911
2    52.251688
3    32.654386
4    47.928189
5    40.947802
Name: weight, dtype: float64
In [92]:
site_mean_weights.plot(kind = 'bar'); # plot mean weights of sites 
In [96]:
sex_counts = surveys_df.groupby('sex')['record_id'].count()
In [99]:
print(sex_counts)
sex
F    15690
M    17348
Name: record_id, dtype: int64
In [98]:
sex_counts.plot(kind = 'bar'); # plot number of records of each sex of entire data set 
In [ ]:
# SUMMARY PLOTTING CHALLENGE
# Create a stacked bar plot, with weight on the Y axis, and the stacked variable being sex. 
# The plot should show total weight by sex for each site. 
In [100]:
# Example data for making the plot 
d = {'one' : pd.Series([1., 2., 3.], index=['a', 'b', 'c']), 'two' : pd.Series([1., 2., 3., 4.], index=['a', 'b', 'c', 'd'])}
In [101]:
pd.DataFrame(d)
Out[101]:
one two
a 1.0 1.0
b 2.0 2.0
c 3.0 3.0
d NaN 4.0
In [103]:
# Plot stacked data so columns 'one' and 'two' are stacked
my_df = pd.DataFrame(d)
my_df.plot(kind='bar', stacked=True, title="Example graph")
Out[103]:
<matplotlib.axes._subplots.AxesSubplot at 0x123e94fd0>
In [104]:
# let's try to do this with weights by sites and sex 
site_sex = surveys_df.groupby(['plot_id', 'sex'])
In [105]:
site_sex_count = site_sex['weight'].sum()
In [106]:
print(site_sex_count)
plot_id  sex
1        F      38253.0
         M      59979.0
2        F      50144.0
         M      57250.0
3        F      27251.0
         M      28253.0
4        F      39796.0
         M      49377.0
5        F      21143.0
         M      23326.0
6        F      26210.0
         M      27245.0
7        F       6522.0
         M       6422.0
8        F      37274.0
         M      47755.0
9        F      44128.0
         M      48727.0
10       F       2359.0
         M       2776.0
11       F      34638.0
         M      43106.0
12       F      51825.0
         M      57420.0
13       F      24720.0
         M      30354.0
14       F      32770.0
         M      46469.0
15       F      12455.0
         M      11037.0
16       F       5446.0
         M       6310.0
17       F      42106.0
         M      48082.0
18       F      27353.0
         M      26433.0
19       F      11297.0
         M      11514.0
20       F      33206.0
         M      25988.0
21       F      15481.0
         M       9815.0
22       F      34656.0
         M      35363.0
23       F       3352.0
         M       3883.0
24       F      22951.0
         M      18835.0
Name: weight, dtype: float64
In [108]:
site_sex_count.unstack() 
# transforms grouped data into columns 
Out[108]:
sex F M
plot_id
1 38253.0 59979.0
2 50144.0 57250.0
3 27251.0 28253.0
4 39796.0 49377.0
5 21143.0 23326.0
6 26210.0 27245.0
7 6522.0 6422.0
8 37274.0 47755.0
9 44128.0 48727.0
10 2359.0 2776.0
11 34638.0 43106.0
12 51825.0 57420.0
13 24720.0 30354.0
14 32770.0 46469.0
15 12455.0 11037.0
16 5446.0 6310.0
17 42106.0 48082.0
18 27353.0 26433.0
19 11297.0 11514.0
20 33206.0 25988.0
21 15481.0 9815.0
22 34656.0 35363.0
23 3352.0 3883.0
24 22951.0 18835.0
In [111]:
site_sex_count.plot(kind='bar', stacked=True, title="Total weight by site and sex")
# so if we don't unstack first, it doesn't allow us to stack two variables within a site 
Out[111]:
<matplotlib.axes._subplots.AxesSubplot at 0x124028280>
In [112]:
to_plot = site_sex_count.unstack() 
In [113]:
to_plot.plot(kind='bar', stacked=True, title="Total weight by site and sex") # yay 
Out[113]:
<matplotlib.axes._subplots.AxesSubplot at 0x13050d370>