In [1]:
%cd /Users/alisonfowler/Docs/Jupityr-notebooks/
/Users/alisonfowler/Docs/Jupityr-notebooks
In [2]:
import pandas as pd
In [3]:
# Load the survey records; the path is relative to the working directory chosen by %cd above.
surveys_df = pd.read_csv("data/surveys.csv")
In [4]:
# Confirm what kind of object read_csv handed back.
type(surveys_df)
Out[4]:
pandas.core.frame.DataFrame
In [5]:
surveys_df['sex'].dtype
# 'O' is the generic "object" dtype. It is not a dedicated string type: the column
# holds arbitrary Python objects, which for this text column are strings.
Out[5]:
dtype('O')
In [7]:
surveys_df['record_id'].dtype
# int64 = 64-bit integer
Out[7]:
dtype('int64')
In [8]:
surveys_df.dtypes
# Lists the dtype of every column in the DataFrame. The final "dtype: object" line
# describes the Series that holds this listing, not a column of the data.
Out[8]:
record_id            int64
month                int64
day                  int64
year                 int64
plot_id              int64
species_id          object
sex                 object
hindfoot_length    float64
weight             float64
dtype: object
In [9]:
# Adding two integers gives an integer.
print(5+5)
10
In [10]:
# Subtracting two integers gives an integer.
print(24-4)
20
In [11]:
# "/" is true division: dividing two integers produces a float.
print(5/9)
0.5555555555555556
In [12]:
# The printed result ends in ...35: floats are stored approximately, so the last digit is rounding noise.
print(10/3)
3.3333333333333335
In [13]:
# Convert a to an integer.
# int() discards the fractional part instead of rounding, so 7.83 becomes 7 (not 8).
a = 7.83
int(a)
Out[13]:
7
In [14]:
# Convert b to a float; the numeric value is unchanged, only its type (7 -> 7.0).
b = 7
float(b)
Out[14]:
7.0
In [15]:
# Convert the record_id field from an integer to a float.
# NOTE: the result is assigned back onto surveys_df, so the column stays float64
# for every later cell until the CSV is read again.
surveys_df['record_id'] = surveys_df['record_id'].astype('float64')
surveys_df['record_id'].dtype
Out[15]:
dtype('float64')
In [16]:
# astype returns a new float64 Series; nothing is assigned, so the plot_id
# column stored in surveys_df keeps its original dtype.
surveys_df["plot_id"].astype(float)
Out[16]:
0         2.0
1         3.0
2         2.0
3         7.0
4         3.0
         ... 
35544    15.0
35545    15.0
35546    10.0
35547     7.0
35548     5.0
Name: plot_id, Length: 35549, dtype: float64
In [19]:
# Next try converting weight to an integer. What goes wrong here? What is Pandas telling you?
surveys_df.weight.astype("int")

# This raises ValueError: weight contains NaN (missing) values, and pandas refuses
# to convert non-finite values (NA or inf) to a plain integer dtype.
# NOTE(review): the exception is not caught, so "Run All" stops at this cell.
---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-19-127bf4231017> in <module>
      1 # Next try converting weight to an integer. What goes wrong here? What is Pandas telling you?
----> 2 surveys_df.weight.astype("int")

~/opt/anaconda3/lib/python3.8/site-packages/pandas/core/generic.py in astype(self, dtype, copy, errors)
   5696         else:
   5697             # else, only a single dtype is given
-> 5698             new_data = self._data.astype(dtype=dtype, copy=copy, errors=errors)
   5699             return self._constructor(new_data).__finalize__(self)
   5700 

~/opt/anaconda3/lib/python3.8/site-packages/pandas/core/internals/managers.py in astype(self, dtype, copy, errors)
    580 
    581     def astype(self, dtype, copy: bool = False, errors: str = "raise"):
--> 582         return self.apply("astype", dtype=dtype, copy=copy, errors=errors)
    583 
    584     def convert(self, **kwargs):

~/opt/anaconda3/lib/python3.8/site-packages/pandas/core/internals/managers.py in apply(self, f, filter, **kwargs)
    440                 applied = b.apply(f, **kwargs)
    441             else:
--> 442                 applied = getattr(b, f)(**kwargs)
    443             result_blocks = _extend_blocks(applied, result_blocks)
    444 

~/opt/anaconda3/lib/python3.8/site-packages/pandas/core/internals/blocks.py in astype(self, dtype, copy, errors)
    623             vals1d = values.ravel()
    624             try:
--> 625                 values = astype_nansafe(vals1d, dtype, copy=True)
    626             except (ValueError, TypeError):
    627                 # e.g. astype_nansafe can fail on object-dtype of strings

~/opt/anaconda3/lib/python3.8/site-packages/pandas/core/dtypes/cast.py in astype_nansafe(arr, dtype, copy, skipna)
    866 
    867         if not np.isfinite(arr).all():
--> 868             raise ValueError("Cannot convert non-finite values (NA or inf) to integer")
    869 
    870     elif is_object_dtype(arr):

ValueError: Cannot convert non-finite values (NA or inf) to integer
In [23]:
# Number of NaNs in weight: isna() flags each missing entry and sum() counts the flags.
int(surveys_df["weight"].isna().sum())
Out[23]:
3266
In [24]:
# Number of rows that have a real (non-missing) value for weight.
# Presence is tested with notna() rather than `weight > 0`, because a positivity
# test would also leave out any weight recorded as 0 or less.
int(surveys_df["weight"].notna().sum())
Out[24]:
32283
In [25]:
# Build a separate frame whose weight column has every NaN replaced by 0.
# assign() works on a copy, so surveys_df itself is not modified.
df1 = surveys_df.assign(weight=surveys_df["weight"].fillna(0))
# ... but we don't actually want to do this.
In [26]:
# Replace with mean
# Both the values and the mean are taken from surveys_df, not df1: df1's NaNs were
# already overwritten with 0 in the previous cell, so df1 has nothing left to fill
# and its mean would be pulled down by those zeros.
df1['weight'] = surveys_df['weight'].fillna(surveys_df['weight'].mean())
In [27]:
# How many missing values are in the other columns?

# record_id: number of missing entries
int(surveys_df["record_id"].isna().sum())
Out[27]:
0
In [28]:
# month: number of missing entries
int(surveys_df["month"].isna().sum())
Out[28]:
0
In [29]:
# day: number of missing entries
int(surveys_df["day"].isna().sum())
Out[29]:
0
In [30]:
# year: number of missing entries
int(surveys_df["year"].isna().sum())
Out[30]:
0
In [31]:
# plot_id: number of missing entries
int(surveys_df["plot_id"].isna().sum())
Out[31]:
0
In [32]:
# species_id: number of missing entries
int(surveys_df["species_id"].isna().sum())
Out[32]:
763
In [33]:
# sex: number of missing entries
int(surveys_df["sex"].isna().sum())
Out[33]:
2511
In [34]:
# hindfoot length: number of missing entries
int(surveys_df["hindfoot_length"].isna().sum())
Out[34]:
4111
In [35]:
# Reload the data so we don't get confused if we've made changes
# (for example, record_id was converted to float64 earlier in this notebook).
surveys_df = pd.read_csv("data/surveys.csv")
In [36]:
# Keep only complete records: a row is discarded if any one of its columns is NaN.
# Despite its name, df_na holds the rows that have NO missing values.
df_na = surveys_df.dropna(axis=0, how="any")
In [37]:
# Display the cleaned frame; the rendered table is abbreviated to its first and last rows.
df_na
Out[37]:
record_id month day year plot_id species_id sex hindfoot_length weight
62 63 8 19 1977 3 DM M 35.0 40.0
63 64 8 19 1977 7 DM M 37.0 48.0
64 65 8 19 1977 4 DM F 34.0 29.0
65 66 8 19 1977 4 DM F 35.0 46.0
66 67 8 19 1977 7 DM M 35.0 36.0
... ... ... ... ... ... ... ... ... ...
35540 35541 12 31 2002 15 PB F 24.0 31.0
35541 35542 12 31 2002 15 PB F 26.0 29.0
35542 35543 12 31 2002 15 PB F 27.0 34.0
35546 35547 12 31 2002 10 RM F 15.0 14.0
35547 35548 12 31 2002 7 DO M 36.0 51.0

30676 rows × 9 columns

In [39]:
# Write new DataFrame to CSV; index=False leaves the row-index column out of the file.
df_na.to_csv('data/surveys_complete.csv', index=False)