# coding: utf-8

# # Overview of Pandas and Xarray groupby(), resample()

# In[1]:

import numpy as np
import pandas as pd
import xarray as xr
from matplotlib import pyplot as plt
get_ipython().run_line_magic('matplotlib', 'inline')


# ## Get some data: tornado reports

# ## Read the csv file - check out the options

# In[2]:

d = pd.read_csv('1950-2016_all_tornadoes.csv', delimiter=',', header=0)
d.head()


# In[3]:

type(d)


# In[4]:

get_ipython().run_line_magic('pinfo', 'pd.read_csv')


# ### I can tell read_csv to parse some columns and create a time index

# In[5]:

# parse_dates=[[1, 2, 3, 5]] combines the year, month, day, and time columns
# into a single datetime column; note that error_bad_lines was replaced by
# on_bad_lines='skip' in newer pandas
d = pd.read_csv('1950-2016_all_tornadoes.csv', delimiter=',', header=0,
                error_bad_lines=False, parse_dates=[[1, 2, 3, 5]],
                keep_date_col=True)
d.head()


# ### Useful commands to always run to check how the data were loaded

# In[6]:

d.info()


# #### Some of the variables were loaded as objects although, except for the state name, they are all numbers.
# #### A quick way to convert them is the following (I will get an error because the state names can't be converted):

# In[7]:

# note: convert_objects() was removed in newer pandas;
# d.apply(pd.to_numeric, errors='ignore') is the closest modern equivalent
d = d.convert_objects(convert_numeric=True)
d.info()


# #### .describe() gives an overview of the data
# #### look at EFscale

# In[8]:

d.describe()


# #### Accessing columns/rows can be simple or tricky depending on how complicated the selection is.
# #### In general, columns can be accessed "attribute style" or through their key names

# Attribute style

# In[9]:

d.EFscale.head()


# keys

# In[10]:

d.keys()


# In[11]:

d['EFscale'].head()


# #### Let's exclude the negative EFscale values

# In[12]:

d = d[d.EFscale > -1]
d.describe()


# In[13]:

d = d[d['sg'] == 1]
d.startlat.describe()


# ### Let's rename the date column

# In[14]:

d = d.rename(index=str, columns={"yr_mo_dy_time": "date_time"})
d.head()


# ### Let's use the date_time column as an index

# In[15]:

d.set_index(keys='date_time', inplace=True)
d.head()


# In[16]:

d['1960-01-01':'1960-02-01']


# ## Let's look at groupby

# In[17]:

get_ipython().run_line_magic('pinfo', 'd.groupby')


# ## Simple examples

# ### Group by one of the columns

# In[18]:

d.groupby(by='EFscale').count()


# #### Note how all the columns report the same value; we are counting the number of elements for each EFscale, so that makes sense

# In[19]:

d.groupby('EFscale').mean()


# #### In this case each column averages its own values, so the results differ. Makes sense.
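# #### Instead of calling .count() and .mean() in separate cells, .agg() computes several statistics in one pass. A minimal sketch on the fatalities column:

# In[ ]:

# one row per EFscale, one column per statistic
d.groupby('EFscale')['fatalities'].agg(['count', 'mean', 'max'])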
# In[20]:

d.groupby('EFscale').max().fatalities


# In[21]:

d[d['fatalities'] == 158]


# #### Because our index is a DatetimeIndex (and there are ways to transform it into that type if it doesn't happen magically when you load the file), we can use some attributes that are always available for this class

# In[22]:

type(d.index)


# In[23]:

d.index


# In[24]:

d.index.weekday


# In[25]:

d.index.hour


# In[26]:

d.index.year


# ### Let's see the number of tornadoes per weekday

# In[27]:

fig, ax = plt.subplots()
d.groupby(d.index.weekday)['EFscale'].count().plot(kind='bar', ax=ax)


# ### What about the hour of the day?

# In[28]:

fig, ax = plt.subplots()
d.groupby(d.index.hour)['EFscale'].count().plot(kind='bar', ax=ax)


# ### What is the average intensity per hour of the day?

# In[29]:

fig, ax = plt.subplots()
d.groupby(d.index.hour)['EFscale'].mean().plot(kind='bar', ax=ax)


# ### How about the interquartile range?

# In[30]:

fig, ax = plt.subplots()
d.groupby(d.index.hour)['EFscale'].max().plot(kind='line', ax=ax)
d.groupby(d.index.hour)['EFscale'].min().plot(kind='line', ax=ax)
d.groupby(d.index.hour)['EFscale'].quantile(0.25, interpolation='midpoint').plot(kind='line', ax=ax)
d.groupby(d.index.hour)['EFscale'].quantile(0.5, interpolation='midpoint').plot(kind='line', ax=ax)
d.groupby(d.index.hour)['EFscale'].quantile(0.75, interpolation='midpoint').plot(kind='line', ax=ax)


# ### Length of path?

# In[31]:

fig, ax = plt.subplots()
# d.groupby(d.index.hour)['widthyards'].max().plot(kind='line', ax=ax)
# d.groupby(d.index.hour)['widthyards'].min().plot(kind='line', ax=ax)
d.groupby(d.index.hour)['lenghtmiles'].quantile(0.25, interpolation='midpoint').plot(kind='line', ax=ax, label='25%')
d.groupby(d.index.hour)['lenghtmiles'].quantile(0.5, interpolation='midpoint').plot(kind='line', ax=ax, label='50%')
d.groupby(d.index.hour)['lenghtmiles'].quantile(0.75, interpolation='midpoint').plot(kind='line', ax=ax, label='75%')
plt.legend()


# In[32]:

d.groupby(['EFscale'])['lenghtmiles'].quantile(0.5).plot(kind='line', marker='o')
d.groupby(['EFscale'])['lenghtmiles'].quantile(0.1).plot(kind='line', marker='o')
d.groupby(['EFscale'])['lenghtmiles'].quantile(0.9).plot(kind='line', marker='o')


# In[33]:

tempd = d.groupby(['EFscale'])['lenghtmiles']
print(type(tempd))


# In[34]:

tempd.quantile()


# In[35]:

fig, ax = plt.subplots()
d.boxplot(by=['EFscale'], column='lenghtmiles', ax=ax)


# ### What if we want to group by more than one column?

# In[36]:

d.groupby(by=['EFscale', d.index.hour])['om'].count().unstack()


# In[37]:

dd1 = d.groupby(by=[d.index.hour, 'EFscale'])['om'].count()
dd1.head(10)


# In[38]:

dd1.tail()


# In[39]:

dd2 = dd1.unstack()
dd2


# In[40]:

dd2.plot(kind='bar', stacked=True)


# In[41]:

# normalize each row by the total number of tornadoes in that hour
dd3 = dd2.mul(1. / d.groupby(by=[d.index.hour])['om'].count(), axis=0)
dd3.head()


# In[42]:

dd3.plot(kind='bar', stacked=True)


# ### And once again the same analysis, but for month of the year

# In[43]:

fig, ax = plt.subplots()
d.groupby(d.index.month)['EFscale'].count().plot(kind='bar', ax=ax)


# In[44]:

fig, ax = plt.subplots(figsize=(18, 3))
d.groupby(d.index.year)['EFscale'].count().plot(kind='bar', ax=ax)
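# ### Grouping by year and month at once, with the same unstack() pattern as above, gives the seasonal cycle for every year. A minimal sketch (counts_ym is a throwaway name):

# In[ ]:

# rows: year, columns: month, values: number of tornado reports
counts_ym = d.groupby([d.index.year, d.index.month])['om'].count().unstack()
counts_ym.head()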
# # What else can I do on these groups?

# In[45]:

groupbyobject = d.groupby('EFscale')


# In[46]:

groupbyobject


# # Iterate on groups

# In[47]:

# group the dataframe by EFscale, and for each group
for name, group in d.groupby('EFscale'):
    # print the name of the group (the EFscale value)
    print(name)
    # print the first rows of that group
    print(group.head())


# In[48]:

# one hexbin map of tornado start locations per EFscale group
for name, group in d.groupby('EFscale'):
    group.plot.hexbin('startlon', 'startlat', cmap='viridis')


# # Use your own function with .apply()

# In[49]:

def get_stats(group):
    return {'min': group.min(),
            'max': group.max(),
            'count': group.count(),
            'mean': group.mean()}


# In[50]:

d['lenghtmiles'].groupby(d['EFscale']).apply(get_stats).unstack()


# # .resample()

# ### What if I want the time series at a different time resolution: 1 month? 1 hour?

# In[51]:

get_ipython().run_line_magic('pinfo', 'd.resample')


# In[52]:

fig, ax = plt.subplots(figsize=(20, 3))
# '1m' and '1M' resolve to the same month-end frequency here
d.resample('1m')['om'].count().plot()
d.resample('1M')['om'].count().plot()


# In[53]:

d.resample('1MS')['om'].count().head()


# In[54]:

d.resample('1M')['om'].count().head()


# In[55]:

fig, ax = plt.subplots(figsize=(20, 3))
d.resample('1QS')['om'].count().plot()


# In[56]:

d.resample('1QS')['om'].mean().head()


# In[57]:

d.resample('1QS-DEC')['om'].mean().head()


# # Xarray

# In[58]:

# bin the start coordinates onto a 1-degree grid
step = 1.
to_bin = lambda x: np.round(x / step) * step
d["lat1"] = d.startlat.map(to_bin)
d["lon1"] = d.startlon.map(to_bin)


# In[59]:

# the same binning written as a named function
def to_bin2(x):
    return np.round(x / step) * step

d["lat2"] = d.startlat.map(to_bin2)
d["lon2"] = d.startlon.map(to_bin2)


# In[60]:

# count reports in each (lat, lon, 3-hour) bin for 2010-2012
TD = d[(d['yr'] >= 2010) & (d['yr'] <= 2012)].groupby(
    ['lat1', 'lon1', pd.Grouper(freq='3H')]).endlon.count()


# In[61]:

TD.head()


# In[62]:

type(TD)


# In[63]:

TD = pd.DataFrame(TD)


# In[64]:

# the MultiIndex (lat1, lon1, date_time) becomes a single dimension, dim_0
dsT = xr.Dataset(TD)


# In[65]:

dsT


# In[ ]:

dsT = dsT.unstack('dim_0')
dsT


# In[ ]:

year = dsT['date_time.year']
year


# In[ ]:

dsT1M = dsT.resample(date_time='1MS').sum()


# In[ ]:

dsT1M


# In[ ]:

dsT1M.sel(lat1=slice(25, 50), lon1=slice(-125, -60),
          date_time=slice('2010-01-01', '2010-01-31')).endlon.plot()


# In[ ]:

dsT1M.sel(lat1=slice(25, 50), lon1=slice(-125, -60)).endlon[0].plot()


# ### rolling()

# In[ ]:

dsT3r = dsT1M.rolling(date_time=3).sum()


# In[ ]:

dsT3r


# In[ ]:

dsT3r.sel(lat1=slice(25, 50), lon1=slice(-125, -60)).endlon[0].plot()


# In[ ]:

dsT3r.sel(lat1=slice(25, 50), lon1=slice(-125, -60)).endlon[3].plot()
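# #### By default rolling() leaves NaN wherever the window is incomplete; the center and min_periods options change that. A minimal sketch of the same 3-month sum, centered (dsT3c is a throwaway name):

# In[ ]:

# centered 3-month running sum; min_periods=1 keeps the endpoints instead of NaN
dsT3c = dsT1M.rolling(date_time=3, center=True, min_periods=1).sum()
dsT3c.sel(lat1=slice(25, 50), lon1=slice(-125, -60)).endlon[0].plot()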