# IPython line magic: make the notebook folder the working directory so that
# relative paths such as "data/surveys.csv" are resolved against it.
%cd /Users/alisonfowler/Docs/Jupityr-notebooks/

/Users/alisonfowler/Docs/Jupityr-notebooks

import pandas as pd

# Load the survey records from the CSV file into a pandas DataFrame.
surveys_df = pd.read_csv("data/surveys.csv")

# Check what kind of object read_csv returned.
type(surveys_df)

pandas.core.frame.DataFrame

surveys_df['sex'].dtype
# 'O' is the object dtype, which pandas uses for columns holding text (Python strings).

dtype('O')

surveys_df['record_id'].dtype
# int64 is a 64-bit integer.

dtype('int64')

surveys_df.dtypes
# Lists the dtype of every column in the DataFrame.

record_id            int64
month                int64
day                  int64
year                 int64
plot_id              int64
species_id          object
sex                 object
hindfoot_length    float64
weight             float64
dtype: object

# Basic arithmetic: adding or subtracting two ints gives an int.
print(5 + 5)

10

print(24 - 4)

20

# The "/" operator performs true division and returns a float,
# even when both operands are ints.
print(5 / 9)

0.5555555555555556

print(10 / 3)

3.3333333333333335

# Convert a to an integer: int() drops the fractional part (7.83 -> 7); it does not round.
a = 7.83
int(a)

7

# Convert b to a float: the value is unchanged, only its type becomes float (7 -> 7.0).
b = 7
float(b)

7.0

# Cast record_id from integer to float. astype() returns a new Series, so the
# result has to be assigned back to the column for the change to persist.
record_id_as_float = surveys_df['record_id'].astype('float64')
surveys_df['record_id'] = record_id_as_float
# Confirm the column's new dtype.
surveys_df['record_id'].dtype

dtype('float64')

# Preview plot_id converted to floats. Nothing is assigned here, so the
# plot_id column stored in surveys_df is left as it was.
surveys_df["plot_id"].astype("float")

0         2.0
1         3.0
2         2.0
3         7.0
4         3.0
         ... 
35544    15.0
35545    15.0
35546    10.0
35547     7.0
35548     5.0
Name: plot_id, Length: 35549, dtype: float64

# Next try converting weight to an integer. What goes wrong here? What is Pandas telling you? 
surveys_df.weight.astype("int")

# Error: the weight column contains NaN (missing) values, and NaN cannot be
# converted to an integer, so pandas raises a ValueError instead of guessing.

---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-19-127bf4231017> in <module>
      1 # Next try converting weight to an integer. What goes wrong here? What is Pandas telling you?
----> 2 surveys_df.weight.astype("int")

~/opt/anaconda3/lib/python3.8/site-packages/pandas/core/generic.py in astype(self, dtype, copy, errors)
   5696         else:
   5697             # else, only a single dtype is given
-> 5698             new_data = self._data.astype(dtype=dtype, copy=copy, errors=errors)
   5699             return self._constructor(new_data).__finalize__(self)
   5700 

~/opt/anaconda3/lib/python3.8/site-packages/pandas/core/internals/managers.py in astype(self, dtype, copy, errors)
    580 
    581     def astype(self, dtype, copy: bool = False, errors: str = "raise"):
--> 582         return self.apply("astype", dtype=dtype, copy=copy, errors=errors)
    583 
    584     def convert(self, **kwargs):

~/opt/anaconda3/lib/python3.8/site-packages/pandas/core/internals/managers.py in apply(self, f, filter, **kwargs)
    440                 applied = b.apply(f, **kwargs)
    441             else:
--> 442                 applied = getattr(b, f)(**kwargs)
    443             result_blocks = _extend_blocks(applied, result_blocks)
    444 

~/opt/anaconda3/lib/python3.8/site-packages/pandas/core/internals/blocks.py in astype(self, dtype, copy, errors)
    623             vals1d = values.ravel()
    624             try:
--> 625                 values = astype_nansafe(vals1d, dtype, copy=True)
    626             except (ValueError, TypeError):
    627                 # e.g. astype_nansafe can fail on object-dtype of strings

~/opt/anaconda3/lib/python3.8/site-packages/pandas/core/dtypes/cast.py in astype_nansafe(arr, dtype, copy, skipna)
    866 
    867         if not np.isfinite(arr).all():
--> 868             raise ValueError("Cannot convert non-finite values (NA or inf) to integer")
    869 
    870     elif is_object_dtype(arr):

ValueError: Cannot convert non-finite values (NA or inf) to integer

# Number of rows whose weight is missing (NaN): isna() flags each missing
# entry as True, and summing the flags counts them.
int(surveys_df['weight'].isna().sum())

3266

# Number of rows that have a recorded (non-missing) weight. Counting non-null
# values directly, rather than testing weight > 0, keeps this the exact
# complement of the NaN count even if a weight of 0 were ever present.
int(surveys_df['weight'].notna().sum())

32283

# Experiment on a copy so surveys_df itself is left untouched.
df1 = surveys_df.copy()

# Option 1: replace every missing weight with 0 ...
weight_zero_filled = df1['weight'].fillna(0)
df1['weight'] = weight_zero_filled
# ... but we don't actually want to do this.

# Option 2: replace missing weights with the column mean. Both the mean and the
# fill start from surveys_df because df1's NaNs were already turned into zeros
# by Option 1 above.
mean_weight = surveys_df['weight'].mean()
df1['weight'] = surveys_df['weight'].fillna(mean_weight)

# How many missing values are in the other columns?
# For each column, isna() flags the missing entries and sum() counts the flags.

# record_id
int(surveys_df['record_id'].isna().sum())

0

# month
int(surveys_df['month'].isna().sum())

0

# day
int(surveys_df['day'].isna().sum())

0

# year
int(surveys_df['year'].isna().sum())

0

# plot_id
int(surveys_df['plot_id'].isna().sum())

0

# species_id
int(surveys_df['species_id'].isna().sum())

763

# sex
int(surveys_df['sex'].isna().sum())

2511

# hindfoot length
int(surveys_df['hindfoot_length'].isna().sum())

4111

# Re-read the CSV so the steps below start from the original, unmodified data.
surveys_df = pd.read_csv("data/surveys.csv")

# Drop every row that has a missing value in at least one column.
# axis=0 and how="any" are dropna's defaults, spelled out here for clarity.
df_na = surveys_df.dropna(axis=0, how="any")

df_na

# Save the complete rows to a new CSV; index=False leaves the row index out of the file.
df_na.to_csv(path_or_buf='data/surveys_complete.csv', index=False)

	record_id	month	day	year	plot_id	species_id	sex	hindfoot_length	weight
62	63	8	19	1977	3	DM	M	35.0	40.0
63	64	8	19	1977	7	DM	M	37.0	48.0
64	65	8	19	1977	4	DM	F	34.0	29.0
65	66	8	19	1977	4	DM	F	35.0	46.0
66	67	8	19	1977	7	DM	M	35.0	36.0
...	...	...	...	...	...	...	...	...	...
35540	35541	12	31	2002	15	PB	F	24.0	31.0
35541	35542	12	31	2002	15	PB	F	26.0	29.0
35542	35543	12	31	2002	15	PB	F	27.0	34.0
35546	35547	12	31	2002	10	RM	F	15.0	14.0
35547	35548	12	31	2002	7	DO	M	36.0	51.0