# coding: utf-8

# # Overview of Pandas and Xarray groupby(), resample()

# ### let's load all the libraries we need.

# In[33]:

import numpy as np
import pandas as pd
import xarray as xr
from matplotlib import pyplot as plt
get_ipython().run_line_magic('matplotlib', 'inline')

# ## Get some data: tornado reports
# ## read the csv file - check out the options

# In[34]:

get_ipython().system(' cp /home/pangeo/Advanced-notebooks/1950-2016_all_tornadoes.csv .')
d = pd.read_csv('1950-2016_all_tornadoes.csv', delimiter=',', header=0)
d.head()

# what object do I have now? Always check the type.

# In[35]:

type(d)

# In[36]:

get_ipython().run_line_magic('pinfo', 'pd.read_csv')

# ### I can tell read_csv to parse some columns and create a time index

# In[37]:

d = pd.read_csv('1950-2016_all_tornadoes.csv', delimiter=',', header=0,
                error_bad_lines=False, parse_dates=[[1, 2, 3, 5]], keep_date_col=True)
d.head()

# In[38]:

d.keys()

# for simplicity I am "dropping" some columns I won't need (check what inplace=True does in the options)

# In[39]:

d.drop(columns=['timezone', 'stateFIPS', 'StateNumber', 'croploss', 'ns',
                'sn', 'fips1', 'fips2', 'fips3', 'fips4', 'fc'], inplace=True)
d.head()

# In[40]:

d.shape

# ### Useful commands to always run to check how the data were loaded

# In[41]:

d.info()

# #### some of the variables were loaded as objects although - except for a few - they are all numbers.
# #### a quick way to convert them is the following (note: .convert_objects() was deprecated in later pandas releases in favour of pd.to_numeric)

# In[44]:

d = d.convert_objects(convert_numeric=True)
d.info()

# #### .describe() gives an overview of the data
# look at EFscale (tornado intensity): there are some negative intensity values, and I will want to exclude those.

# In[45]:

d.describe()

# #### Accessing columns/rows can be simple or tricky depending on how complicated the selection is.
# #### In general, columns can be accessed "attribute style" or through the key names

# Attribute style, with the "dot" and the name: .EFscale

# In[46]:

d.EFscale.head()

# using the keys, which are essentially the labels of the columns (depending on the method/function you are using, "labels", "columns", and "keys" all refer to the column names - always read the help)

# In[47]:

d.keys()

# In[48]:

d['EFscale'].head()

# let's tell python to exclude those negative EFscale values - note, I use the attribute-style access of column EFscale

# In[49]:

d = d[d.EFscale > -1]
d.describe()

# the column sg identifies full-track tornadoes; for my analysis I want only those, so once again I will exclude the other values
#
# note I use the key access to get my column sg

# In[50]:

d['sg'] == 1

# In[51]:

d = d[d['sg'] == 1]
d.startlat.describe()

# ### let's rename the column with the parsed dates

# In[52]:

d = d.rename(index=str, columns={"yr_mo_dy_time": "date_time"})
d.head()

# ### let's set the date_time column as an index
#
# note - when you turn a column into an index, that column is removed from the dataframe

# In[53]:

d.set_index(keys='date_time', inplace=True)
d.head()

# # Q. does the index need to be unique?
#
# short answer: no. Non-unique indexes are "tolerated", but some operations have limits with them.
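# a tiny sketch of what a non-unique index looks like (the toy dataframe here is purely for illustration): selecting a repeated label simply returns all the matching rows

# In[ ]:

toy = pd.DataFrame({'val': [1, 2, 3]}, index=['a', 'a', 'b'])
print(toy.index.is_unique)   # False - pandas accepts the duplicate labels
print(toy.loc['a'])          # both rows labelled 'a' come back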
# # let's first "reset_index" because I don't want to lose my date_time column

# In[57]:

d.reset_index(inplace=True)
d.head()

# In[58]:

d.set_index(keys='EFscale', inplace=True)

# In[59]:

d.head()

# more info here
# https://stackoverflow.com/questions/16626058/what-is-the-performance-impact-of-non-unique-indexes-in-pandas

# let's go back to the dataframe with date_time as the index

# In[60]:

d.reset_index(inplace=True)
d.set_index(keys='date_time', inplace=True)
d.head()

# ### this allows us to access the data based on the date value

# In[61]:

d['1960-01-01':'1960-02-01']

# ## let's look at groupby

# In[62]:

get_ipython().run_line_magic('pinfo', 'd.groupby')

# ## simple examples
# ### groupby one of the columns
#
# what object does groupby create?

# In[63]:

d.groupby(by='EFscale')

# In[64]:

tempdgroupby = d.groupby(by='EFscale')
tempdgroupby

# check all the methods and attributes of this object by hitting tab after the "dot"

# In[65]:

# tempdgroupby.   <- hit TAB here in the notebook to list methods and attributes

# In[66]:

print(tempdgroupby.yr.count())
print(tempdgroupby['yr'].count())
# they are the same thing, just accessed differently

# #### Let's use count. Note how all the columns report the same value; we are counting the number of elements for each EFscale, so that makes sense

# In[67]:

d.groupby(by='EFscale').count()

# In[68]:

d['EFscale'].unique()

# #### Let's use mean; in this case each column averages its own values, so the columns differ. Makes sense.

# In[69]:

d.groupby('EFscale').mean()

# In[70]:

d.groupby('EFscale')['fatalities'].max()

# In[ ]:

d[d['fatalities'] == 158]

# #### Because our index is a DatetimeIndex (and there are ways to convert it to that type if it doesn't happen automatically when you load the file) we can use some attributes that are always available for this class

# In[71]:

type(d.index)

# In[72]:

d.index

# In[73]:

d.index.weekday

# In[74]:

d.index.hour

# In[75]:

d.index.day

# In[76]:

d.index.month

# In[77]:

d.index.year

# ### let's see the number of tornadoes per weekday

# In[78]:

fig, ax = plt.subplots()
d.groupby(d.index.weekday)['EFscale'].count().plot(kind='bar', ax=ax, ylim=6000)

# what did I do here? I
#
# generated a groupby object -> d.groupby(d.index.weekday)
#
# extracted a column from it -> d.groupby(d.index.weekday)['EFscale']
#
# applied a method to it -> d.groupby(d.index.weekday)['EFscale'].count()
#
# and then applied a plotting function -> d.groupby(d.index.weekday)['EFscale'].count().plot(kind='bar', ax=ax, ylim=6000)
#
# I could have split the chain into separate steps (see the sketch below). I could have also plotted the results with matplotlib syntax.
# Overall, the .plot() method offers fewer options than the full matplotlib library.
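# as mentioned above, the same chain can be split into explicit steps - a minimal sketch reusing the dataframe d built above (the intermediate names are just for illustration):

# In[ ]:

# build the groupby object on the weekday of the DatetimeIndex
byweekday = d.groupby(d.index.weekday)
# pull one column out of the grouped object and apply the reduction
weekday_counts = byweekday['EFscale'].count()
# plot the result with the pandas convenience wrapper
fig, ax = plt.subplots()
weekday_counts.plot(kind='bar', ax=ax)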
# some over-the-top customization below...

# In[79]:

import calendar
fig, ax = plt.subplots()
tempd = d.groupby(d.index.weekday)['EFscale'].count()
ax.plot(tempd, 'o-', color='cornflowerblue', label='tornado occurrences per weekday')
ax.set_xticks(np.arange(0, 7))
ax.set_xticklabels(list(calendar.day_name), rotation=90)
ax.set_ylim(bottom=6000)
ax.grid(axis='x')
ax.legend(bbox_to_anchor=[1, 1])

# ### what about the hour of the day

# In[80]:

fig, ax = plt.subplots()
d.groupby(d.index.hour)['EFscale'].count().plot(kind='bar', ax=ax)

# ### what is the average intensity per hour of the day

# In[81]:

fig, ax = plt.subplots()
groupbyhour = d.groupby(d.index.hour)
getEFscaleoutofthegroup = groupbyhour['EFscale']
calculatethemean = getEFscaleoutofthegroup.mean()
# plot
calculatethemean.plot(kind='bar', ax=ax)

# ### how about other functions besides .mean() and .count()?

# In[82]:

tempgroupby = d.groupby('EFscale')

# In[85]:

tempgroupby.get_group(4).fatalities.plot()
tempgroupby.get_group(5).fatalities.plot()

# ### quantiles of the path length

# In[86]:

fig, ax = plt.subplots()
# d.groupby(d.index.hour)['widthyards'].max().plot(kind='line', ax=ax)
# d.groupby(d.index.hour)['widthyards'].min().plot(kind='line', ax=ax)
d.groupby(d.index.hour)['lenghtmiles'].quantile(0.25).plot(kind='line', ax=ax, label='25%')
d.groupby(d.index.hour)['lenghtmiles'].quantile(0.5).plot(kind='line', ax=ax, label='50%')
d.groupby(d.index.hour)['lenghtmiles'].quantile(0.75).plot(kind='line', ax=ax, label='75%')
plt.legend()

# let's look at the length of the tornado path as a function of EFscale

# In[87]:

d.groupby(['EFscale'])['lenghtmiles'].quantile(0.5).plot(kind='line', marker='o')
d.groupby(['EFscale'])['lenghtmiles'].quantile(0.1).plot(kind='line', marker='o')
d.groupby(['EFscale'])['lenghtmiles'].quantile(0.9).plot(kind='line', marker='o')

# ### sometimes the groupby functionality is available inside other functions, usually through a "by=" parameter

# In[88]:

fig, ax = plt.subplots()
d.boxplot(by=['EFscale'], column='lenghtmiles', ax=ax)

# ### What if we want to group by more than one column?
# Note how the notebook display lets you highlight a row as you hover over it.

# In[89]:

d.groupby(by=['EFscale', d.index.hour]).count().head(15)

# ### order does matter in the grouping

# In[90]:

d.groupby(by=[d.index.hour, 'EFscale']).count().head(15)

# let's extract one column of this dataframe - since I am using .count() all columns are identical, so I'll pick 'om'.

# In[91]:

dd1 = d.groupby(by=[d.index.hour, 'EFscale'])['om'].count()
dd1.head(10)

# In[92]:

dd1.tail()

# ### .unstack() for multi-level (hierarchical) indexes
#
# ### unstack pivots a level of the (necessarily hierarchical) index labels into columns - read the help online

# In[93]:

dd2 = dd1.unstack()
dd2.head(15)

# now I can do some nice plotting

# In[94]:

dd2.plot(kind='bar', stacked=True)

# ### let's normalize between 0 and 1

# In[95]:

dd3 = dd2.mul(1. / d.groupby(by=[d.index.hour])['om'].count(), axis=0)
dd3.head()

# In[96]:

dd3.plot(kind='bar', stacked=True)

# ### and once again the same analysis, but per month of the year

# In[97]:

fig, ax = plt.subplots()
d.groupby(d.index.month)['EFscale'].count().plot(kind='line', ax=ax, marker='o')

# In[98]:

fig, ax = plt.subplots(figsize=(18, 3))
d.groupby(d.index.year)['EFscale'].count().plot(kind='line', ax=ax, marker='o')

# ### once again, you can also create an object and use matplotlib syntax to plot it.

# In[100]:

tempd = d.groupby(d.index.year)['EFscale'].count()
fig, ax = plt.subplots(figsize=(18, 3))
ax.plot(tempd, 'o-')

# In[ ]:

tempd.head()

# # What else can I do on these groups?
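# besides single reductions, .agg() lets you apply several of them at once; a minimal sketch on the yearly grouping above (the particular list of statistics is just an example):

# In[ ]:

# one row per year, one column per statistic of the tornado intensities
d.groupby(d.index.year)['EFscale'].agg(['count', 'mean', 'max']).head()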
# In[ ]:

groupbyobject = d.groupby('EFscale')

# # Iterate on groups
#
# python knows how to handle a groupby object inside a for loop, and understands what to iterate over.
# From my "groupbyobject", each iteration yields first the name (the name of the group, i.e. the category value of your grouping rule) and then the actual group of values.

# In[ ]:

for name, group in groupbyobject:
    # print the name of the group
    print(name)
    # print the first rows of that group
    print(group.head(3))

# In[ ]:

for name, group in groupbyobject:
    # let's plot the values
    group.plot.hexbin('startlon', 'startlat', cmap='viridis')

# # Use your own function with .apply()

# In[ ]:

def get_stats(group):
    return {'min': group.min(), 'max': group.max(),
            'count': group.count(), 'mean': group.mean()}

# In[ ]:

d['lenghtmiles'].groupby(d['EFscale']).apply(get_stats).unstack()

# # .resample()
# ### what if I want the time series at a different time resolution - 1 month? 1 hour?

# In[ ]:

get_ipython().run_line_magic('pinfo', 'd.resample')

# In[ ]:

fig, ax = plt.subplots(figsize=(20, 3))
d.resample('1m')['om'].count().plot()
d.resample('1M')['om'].count().plot()

# check the options for .resample: MS is month start, and it automatically uses the first day of the month.

# In[ ]:

d.resample('1MS')['om'].count().head()

# M is month end, and it uses the last day of the month.

# In[ ]:

d.resample('1M')['om'].count().head()

# the two series seem to differ only in the index

# In[ ]:

np.sum(d.resample('1MS')['om'].count().reset_index() - d.resample('1M')['om'].count().reset_index())['om']

# In[ ]:

np.sum(d.resample('1MS')['om'].mean().reset_index() - d.resample('1M')['om'].mean().reset_index())['om']

# However, it is always important to check "by hand" what you are actually doing, to make sure it is exactly what you want.
#
# Q is for quarterly (three months): Q labels the bins with the end of the month, QS with the start of the month. You can also indicate the starting month (for example, for a DJF/MAM/JJA/SON quarterly resampling you want QS-DEC).

# In[ ]:

fig, ax = plt.subplots(figsize=(20, 3))
d.resample('1QS')['om'].count().plot()

# note the different index; in this case the values don't change, because one day of difference doesn't change things

# In[ ]:

d.resample('1QS')['EFscale'].mean().head()

# In[ ]:

d.resample('1Q')['EFscale'].mean().head()

# Starting from December adds an extra bin, labeled December of the year preceding my data (which start in January 1950). That label is only about the indexing: the bin is not backed by a full quarter of data, so you will want to discard it.

# In[ ]:

d.resample('1QS-DEC')['om'].count().head()

# let's check by hand - and note that in this case the date slice is inclusive of the last value :)

# In[ ]:

d['1950-03-01':'1950-05-31'].om

# In[ ]:

d['1950-03-01':'1950-05-31']['om'].count()

# In[ ]:

d['1949-12-01':'1950-02-28']['om'].count()

# In[ ]:

d.resample('1Q-DEC')['om'].count().head()

# In[ ]:

d['1949-12-31':'1950-03-31']['om'].count()

# In[ ]:

d['1950-03-31':'1950-06-30']['om'].count()

# so to avoid misunderstandings, always check!

# ### let's get daily values
# my first tornado is on January 3rd, so the time index will start on that date

# In[ ]:

ddaily = d.resample('1D').count()
print(ddaily.om.head())
print(ddaily.om[0:45].count())

# In[ ]:

ddaily.head(15)
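# before rolling over these daily values, it is worth checking that the daily index is regular and that days with no tornadoes show up as zero counts rather than missing rows - a quick sketch (reusing ddaily and the 'om' column from above):

# In[ ]:

# the resampled index carries a daily frequency, and calm days have a count of 0
print(ddaily.index.freq)
print((ddaily['om'] == 0).sum(), 'days with no reported tornadoes')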
# # .rolling()
# ## allows you to do a rolling (moving-window) operation; you specify the window of the operation, so your data usually need to be on some kind of regular time grid already (or maybe not - it depends on your data, of course).
#
# in my case I want a 3-day rolling sum, therefore I first transform my data into daily values, and then I apply .rolling() with 3 as the window.

# In[ ]:

ddaily.rolling(3).om.sum().head(5)

# ### what about an odd time window -> pd.Grouper (which is what resample does under the hood)

# In[ ]:

ddaily.groupby(pd.Grouper(freq='45D'))['om'].sum()

# # Useful list of commands
#
# https://www.dataquest.io/blog/pandas-cheat-sheet/
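# one last sanity check before closing: since pd.Grouper is what resample uses under the hood, the two should give the same 45-day sums - a minimal sketch (the .equals() comparison is just a convenience):

# In[ ]:

via_grouper = ddaily.groupby(pd.Grouper(freq='45D'))['om'].sum()
via_resample = ddaily.resample('45D')['om'].sum()
# with the default options the two results should match exactly
print(via_grouper.equals(via_resample))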