# coding: utf-8

# # Overview of Xarray applied to data
#
# As always, load all my libraries.

# In[ ]:


import numpy as np
import pandas as pd
import xarray as xr
from matplotlib import pyplot as plt
get_ipython().run_line_magic('matplotlib', 'inline')


# ### let's get some data - I will load the same dataset as Naomi's lecture

# In[ ]:


ds = xr.open_dataset('/home/pangeo/data/NOAA_NCDC_ERSST_v3b_SST.nc/')


# Let's look at what is in it.

# In[ ]:


ds


# How do I access the variable sst? As in Pandas, there are 2 ways:
# key style and attribute style.

# In[ ]:


ds.sst


# In[ ]:


ds['sst']


# Other attributes that allow me to browse my data.

# In[ ]:


ds.coords


# In[ ]:


ds.attrs


# ### Like in Pandas, I have built-in methods.
# ### Right now, though, we have a Dataset that is comprised of coordinates
# ### and variables (which are DataArrays).
#

# In[ ]:


ds.mean()


# In[ ]:


ds.sst.mean()


# ### but

# In[ ]:


ds.sst.mean(dim='time').plot()


# In[ ]:


# Calling .plot() straight on the time-averaged Dataset fails on purpose.
# The error is caught and printed so the lesson stays visible without
# stopping the rest of this script.
try:
    ds.mean(dim='time').plot()
except (AttributeError, ValueError) as err:
    # The text below describes an attribute error; ValueError is caught as
    # well in case the installed xarray reports the problem differently
    # (assumption - TODO confirm).
    print('{}: {}'.format(type(err).__name__, err))


# ## !!!! Always read the error - it does have suggestions of where the
# ## problem is !!!!

# The error is telling me that the Dataset object has no attribute plot:
# I have to select the DataArray by name before plotting.

# In[ ]:


ds.mean(dim='time').sst.plot()


# ## Going back to my data
#
# #### mean along one dimension?

# In[ ]:


ds.mean(dim='time').sst.plot()


# Along multiple dimensions: put the dimensions in square brackets.

# In[ ]:


ds.mean(dim=['lon', 'lat']).sst.plot()


# You can mix them as you wish.

# In[ ]:


ds.mean(dim=['time', 'lat']).sst.plot()


# In[ ]:


ds.mean(dim=['time', 'lon']).sst.plot()


# # How about selecting part of the data?
# ### sel() where()

# sel() selects part of the data; the dimensions are reduced to the
# selected area.

# In[ ]:


ds.sel(lat=38).mean(dim='time').sst.plot()


# In[ ]:


ds.sel(lat=38).mean(dim='time').sst.shape


# where() masks the data, so the size of the dataset/dataarray is kept, but
# the operations are performed only on the data that is not masked.

# In[ ]:


ds.where(ds.lat == 38).mean(dim='time').sst.plot()


# In[ ]:


ds.where(ds.lat == 38).mean(dim='time').sst.shape


# ## resample()
#
# It used to follow the syntax below, where you defined everything within
# resample(): the frequency of the resampling, the dimension along which to
# do it, and which operation to apply.

# In[ ]:


ds


# In[ ]:


# Legacy call style, kept to show the warning discussed in the next cell.
# NOTE(review): newer xarray releases may reject these keyword arguments
# instead of warning - confirm against the installed version.
ds1Y = ds.resample(freq='1YS', dim='time', how='mean')
ds1Y


# In[ ]:


ds1Y.sst[0].plot()


# According to the warning, the call should now be written like this:

# In[ ]:


ds1Y = ds.resample(time='1YS').mean()
ds1Y


# In[ ]:


ds.time


# As we mentioned in the following class, remember that February values are
# not weighted differently: each monthly value is weighted equally because,
# although xarray correctly "reads" the month from the datetime64 object, it
# does not calculate the interval between two subsequent values. They are
# just instantaneous time values for xarray.

# The coordinate is still a datetime64 object, on which you can use the
# usual attributes.

# In[ ]:


ds.time.dt.hour


# In[ ]:


ds['time.hour']


# In[ ]:


ds1Y.time


# In[ ]:


ds1Y['time.hour']


# These two syntaxes give me the same thing.

# In[ ]:


ds1Y['time.year']


# In[ ]:


ds1Y.time.dt.year


# ## rolling()
#
# Allows some operations to be performed on a rolling window. The window
# runs along whatever dimension you indicate.
# To have a 3-monthly rolling mean, your data needs to be already in monthly
# format.
#

# In[ ]:


ds3M = ds.rolling(time=3).mean()


# In[ ]:


ds3M


# In[ ]:


ds3M.sst[0].plot()


# In[ ]:


ds3M.sst[2].plot()


# ## groupby()
#
# Works the same way as in pandas. It lets you group the data and perform
# operations on the grouped data.
#
# My data are indeed monthly, so the grouping simply gives access to one
# month at a time and lets me perform an operation on it.

# In[ ]:


ds.sst.groupby('time.month')


# In[ ]:


tempgp = ds.sst.groupby('time.month')
# .groups lists the position of the grouped data, in this case the position
# of the corresponding month.
tempgp.groups


# In[ ]:


ds.sst.groupby('time.month').mean().plot(marker='o')


# In[ ]:


ds.sst.groupby('time.month').mean(dim=['time', 'lon'])


# Here are some other averages.

# In[ ]:


ds.sst.groupby('time.month').mean(dim=['time', 'lon']).plot()


# In[ ]:


ds.sst.groupby('time.month').mean(dim=['time', 'lon'])


# In[ ]:


ds.sst.groupby('time.month').mean(dim=['time', 'lat']).plot()


# In[ ]:


ds.sst.groupby('time.month').mean(dim=['time', 'lat'])


# As you can note, the dimension along which the operation is performed is
# dropped. We have a new time dimension, which is month.
#
# Now, to access it, we can use the position (i.e. the 4th element will be
# month 4) or the new label "month".

# In[ ]:


ds.sst.groupby('time.month').mean(dim=['time'])[3].plot()


# In[ ]:


ds.sst.groupby('time.month').mean(dim=['time']).sel(month=4).plot()