# coding: utf-8 # # Details about Time Series handling and Indexing and Selecting data in Xarray # In[1]: import numpy as np import xarray as xr from matplotlib import pyplot as plt get_ipython().run_line_magic('matplotlib', 'inline') import pandas as pd # We will work with some data available through the xarray packages (yes the loading has a different syntax, this is valid only for data in the tutorial method) # In[2]: # load a sample dataset ds = xr.tutorial.load_dataset('air_temperature') # In[3]: ds # In[4]: ds.lat # In[5]: ds.time[1] # # Time series data # ### important to read here # http://xarray.pydata.org/en/stable/time-series.html # # In[6]: ds # In[7]: ds.time.dt.month # In[8]: ds.time.dt.hour # In[9]: ds.time.dt.season # In[10]: ds.groupby('time.season').mean('time') # However, datetime is an instantaneous value, so when certain operations are done on not equally spaced intervals (think about yearly average starting from monthly scale, February months are not long as March) The length of the DELTA interval is not considered. Rather it will correctly match Feb with Feb and so on, but it won't weight February differently .... Same thing when you do a decadal mean and you have leap and not leap years (this is almost never done, but in case your research requires this, have a look at the example below). # # A rather elaborate example of how to create these weights is here # # http://xarray.pydata.org/en/stable/examples/monthly-means.html # # This is essentially in line with doing lat and lon averages... # # # Indexing and selecting # In[11]: type(ds) # In[12]: ds # In[13]: da = ds['air'] type(da) # Now I have a dataset and a dataarray # numpy style indexing still works (but preserves the labels/metadata when we plot it for example) # # Note how i have to add the ":" for the first dimension that I am not selecting through. # In[14]: da[:, 10, 20].plot() # Positional indexing using dimension names - remember it is POSITIONAL, so it won't use longitude equal 20, but the 21th value of longitude # # isel lookup by integer # In[15]: da.lon[20] # In[16]: da.isel(lon=20, lat=10 ).plot() # More interesting is Label-based indexing - you don't need to know the position # # sel does lookup by label (label can be any datatype, in our case we have datetime64 and lat/lon that are float32) # In[17]: type(da.lat.values[0]) # In[18]: da.sel(lat=50., lon=250.).plot() # When you select a point, nearest neighbor is easily done # # Nearest neighbor lookups # In[19]: da.sel(lat=52.25, lon=251.8998, method='nearest').plot() # Indexing by integer array indices (I use isel) # In[20]: print(da.lat[0]) print(da.lon[0]) print(da.time[0:3]) # ### slice() allows you to - ahem - slice the data. It has a different behavious whether you use it in isel, or sel, inherited from Panads and Numpy # In isel() it's not inclusive of the last value (similarly to numpy indexing: # # array[0:3] # # won't include the fourth [position 3] value # In[21]: print( da.lat[0:2]) print( da.lon[0]) print( da.time[0:2]) # In[22]: da.isel(lat=slice(0,2), lon=0, time=slice(0, 2)) # Using sel, instead, slice it is inclusive # In[23]: # index by dimension coordinate labels da.sel(lat=slice(75,71), lon=200, time=slice('2013-01-01', '2013-01-01T06:00:00')) # Please note how the slicing along latitude and time included whatever is in between and equal to the start and end of the slicing. # # Also note, latitude is ordered in the opposite way: # In[24]: da.lat # Because of this, slicing without taking this into account will give me an empty latitude dimension # In[25]: da.sel(lat=slice(71,75), lon=200, time=slice('2013-01-01', '2013-01-01T06:00:00')) # Method Nearest doesn't work when slice() is used, however you can always split up the selection if you need to use the method=nearest for one of the dimesnnions # In[26]: da.sel(lat=71, lon=199, method='nearest').sel(time=slice('2013-01-01', '2013-01-01T06:00:00')) # # Drop # # it is used usually to drop a variable altogether, it can also be used to drop a dimension # # it works for both dataset and dataarray # In[27]: arr = xr.DataArray(np.random.rand(4, 3), [('time', pd.date_range('2000-01-01', periods=4)), ('space', ['IA', 'IL', 'IN'])]) arr # In[28]: arr.drop(['IN', 'IL'], dim='space') # In[29]: dsarr = arr.to_dataset(name='foo') dsarr # In[30]: dsarr.drop(['IN', 'IL'], dim='space') # # Masking with where # # As mentioned before, where "masks" the data # In[31]: ds ds.air.mean(dim=['lat','lon']).plot() # In[32]: ds.air.mean(dim=['lat','lon']).plot() ds.air.mean(dim=['lat','lon']).where(ds.time.dt.month==4).plot() #da.sel(lat=75, lon=200, time=slice('2013-01-01', '2013-01-01T06:00:00')) # In[33]: dsmeantime = ds.air.mean(dim=['time']) dsmeantime.plot() # In[34]: dsmeantime.where(dsmeantime.lat>40).plot() # In[35]: dsmeantime.where(dsmeantime.lat>40&dsmeantime.lat<60&dsmeantime.lon>220&dsmeantime.lon<300).plot() # In[36]: dsmeantime.where((dsmeantime.lat>40)&(dsmeantime.lat<60)&(dsmeantime.lon>220)&(dsmeantime.lon<300)).plot() # # Vectorized indexing # you can supply DataArray() objects as indexers. Dimensions on resultant arrays are given by the ordered union of the indexers’ dimensions: # In[37]: # generate a coordinates for a transect of points lat_points = xr.DataArray([52, 60, 75], dims='points') lon_points = xr.DataArray([250, 250, 250], dims='points') lat_points # nearest neighbor selection along the transect, in this case the order doesn't matter, these are points # In[38]: da.sel(lat=lat_points, lon=lon_points, method='nearest') # There is much more to this but I am still exploring the pros and cons # # Assign values to a dataarray # # this theme is not well described in the help page, and I will try and update that. In the meanwhile below find some examples # In[39]: davi = xr.DataArray(np.arange(12).reshape((3, 4)), dims=['x', 'y'], coords={'x': [0, 1, 2], 'y': ['a', 'b', 'c', 'd']}) davi # you can use numpy like assignment # In[40]: # davi[0,:,:]= -1 davi[0]= -1 davi # In[41]: davi[0,1]= -2 davi # But if you want to select an area? and assign a value to it? # # After some researching I have found that there are two ways to do it. # # First, let's see what doesn't work: # # 1) using where() # In[42]: davi.where((davi.x==2)&(davi.y=='b'))=100 davi # 2) using isel() # In[43]: davi.isel(x=0)=100 davi # 3) using sel() # In[44]: davi.sel(x=2, y='c') =2000 # 4) in some cases it will fail silently (chain indexing) # In[45]: dafs = xr.DataArray([10, 11, 12, 13], dims=['x']) # In[46]: dafs.isel(x=[0, 1, 2]) # In[47]: dafs.isel(x=[0, 1, 2])[1] # In[48]: dafs.isel(x=[0, 1, 2])[1] +=1 # So - what does it work? # # you have two options, one using loc()+dictionary of the values you want to select and assign values to, or # # xr.where() - this xr.where() is different from dataarray.where(), http://xarray.pydata.org/en/stable/generated/xarray.where.html # In[49]: #add an empty 2D dataarray ds['empty']= xr.full_like(ds.air.mean('time'),fill_value=0) ds # In[50]: #modify one grid point, using where() or loc() ds['empty'] = xr.where((ds.coords['lat']==20)&(ds.coords['lon']==260), 100, ds['empty']) ds.empty.plot() # In[51]: dict(lon=260, lat=30) # In[52]: ds['empty'].loc[dict(lon=260, lat=30)] = 100 ds.empty.plot() # In[ ]: #modify an area with where() and a mask mask = (ds.coords['lat']>20)&(ds.coords['lat']<60)&(ds.coords['lon']>220)&(ds.coords['lon']<260) ds['empty'] = xr.where(mask, 100, ds['empty']) ds.empty.plot() # In[ ]: #modify an area with loc() lc = ds.coords['lon'] la = ds.coords['lat'] ds['empty'].loc[dict(lon=lc[(lc>290)&(lc<300)], lat=la[(la>40)&(la<60)])] = 100 ds.empty.plot()