Source code for podpac.datalib.cosmos_stations

from __future__ import division, unicode_literals, print_function, absolute_import

import re
import json
import logging
from six import string_types
from dateutil import parser
from io import StringIO

try:
    import cPickle  # Python 2.7
except:
    import _pickle as cPickle

import numpy as np
import traitlets as tl

# Optional dependencies
from lazy_import import lazy_module

bs4 = lazy_module("bs4")

import podpac
from podpac.core.utils import _get_from_url, cached_property
from podpac.data import DataSource
from podpac.compositor import TileCompositorRaw
from podpac.interpolators import InterpolationMixin

_logger = logging.getLogger(__name__)


def _convert_str_to_vals(properties):
    IGNORE_KEYS = ["sitenumber"]
    for k, v in properties.items():
        if not isinstance(v, string_types) or k in IGNORE_KEYS:
            continue
        try:
            if "," in v:
                properties[k] = tuple([float(vv) for vv in v.split(",")])
            else:
                properties[k] = float(v)
        except ValueError:
            try:
                properties[k] = np.datetime64(v)
            except ValueError:
                pass
    return properties


class COSMOSStation(DataSource):
    _repr_keys = ["label", "network", "location"]

    url = tl.Unicode("http://cosmos.hwr.arizona.edu/Probes/StationDat/")
    station_data = tl.Dict().tag(attr=True)

    @cached_property
    def raw_data(self):
        _logger.info("Downloading station data from {}".format(self.station_data_url))

        r = _get_from_url(self.station_data_url)
        if r is None:
            raise ConnectionError(
                "COSMOS data cannot be retrieved. Is the site {} down?".format(self.station_calibration_url)
            )
        return r.text

    @cached_property
    def data_columns(self):
        return self.raw_data.split("\n", 1)[0].split(" ")

    @property
    def site_number(self):
        return str(self.station_data["sitenumber"])

    @property
    def station_data_url(self):
        return self.url + self.site_number + "/smcounts.txt"

    @property
    def station_calibration_url(self):
        return self.url + self.site_number + "/calibrationInfo.php"

    @property
    def station_properties_url(self):
        return self.url + self.site_number + "/index.php"

    def get_data(self, coordinates, coordinates_index):
        data = np.loadtxt(StringIO(self.raw_data), skiprows=1, usecols=self.data_columns.index("SOILM"))[
            coordinates_index[0]
        ]
        data[data > 100] = np.nan
        data[data < 0] = np.nan
        data /= 100.0  # Make it fractional
        return self.create_output_array(coordinates, data=data.reshape(coordinates.shape))

    def get_coordinates(self):
        lat_lon = self.station_data["location"]
        time = np.atleast_2d(
            np.loadtxt(
                StringIO(self.raw_data),
                skiprows=1,
                usecols=[self.data_columns.index("YYYY-MM-DD"), self.data_columns.index("HH:MM")],
                dtype=str,
            )
        )
        if time.size == 0:
            time = np.datetime64("NaT")
        else:
            time = np.array([t[0] + "T" + t[1] for t in time], np.datetime64)
        c = podpac.Coordinates([time, [lat_lon[0], lat_lon[1]]], ["time", ["lat", "lon"]])
        return c

    @property
    def label(self):
        return self.station_data["label"]

    @property
    def network(self):
        return self.station_data["network"]

    @property
    def location(self):
        return self.station_data["location"]

    @cached_property(use_cache_ctrl=True)
    def calibration_data(self):
        cd = _get_from_url(self.station_calibration_url)
        if cd is None:
            raise ConnectionError(
                "COSMOS data cannot be retrieved. Is the site {} down?".format(self.station_calibration_url)
            )
        cd = cd.json()
        cd["items"] = [_convert_str_to_vals(i) for i in cd["items"]]
        return cd

    @cached_property(use_cache_ctrl=True)
    def site_properties(self):
        r = _get_from_url(self.station_properties_url)
        if r is None:
            raise ConnectionError(
                "COSMOS data cannot be retrieved. Is the site {} down?".format(self.station_properties_url)
            )
        soup = bs4.BeautifulSoup(r.text, "lxml")
        regex = re.compile("Soil Organic Carbon")
        loc = soup.body.findAll(text=regex)[0].parent.parent
        label, value = loc.findAll("div")
        labels = [l.strip() for l in label.children if "br" not in str(l)]
        values = [l.strip() for l in value.children if "br" not in str(l) and l.strip() != ""]

        properties = {k: v for k, v in zip(labels, values)}

        return _convert_str_to_vals(properties)


class COSMOSStationsRaw(TileCompositorRaw):
    url = tl.Unicode("http://cosmos.hwr.arizona.edu/Probes/")
    stations_url = tl.Unicode("sitesNoLegend.js")
    dims = ["lat", "lon", "time"]

    from podpac.style import Style

    style = Style(colormap="jet")

    ## PROPERTIES
    @cached_property(use_cache_ctrl=True)
    def _stations_data_raw(self):
        url = self.url + self.stations_url
        r = _get_from_url(url)
        if r is None:
            raise ConnectionError("COSMOS data cannot be retrieved. Is the site {} down?".format(url))

        t = r.text

        # Fix the JSON
        t_f = re.sub(':\s?",', ': "",', t)  # Missing closing parenthesis
        if t_f[-5:] == ",\n]}\n":  # errant comma
            t_f = t_f[:-5] + "\n]}\n"

        return t_f

    @cached_property
    def stations_data(self):
        stations = json.loads(self._stations_data_raw)
        stations["items"] = [_convert_str_to_vals(i) for i in stations["items"]]
        return stations

    @cached_property(use_cache_ctrl=True)
    def source_coordinates(self):
        lat_lon = np.array(self.stations_value("location"))[self.has_data]
        c = podpac.Coordinates([[lat_lon[:, 0], lat_lon[:, 1]]], ["lat_lon"])
        return c

    @cached_property
    def has_data(self):
        return ~(np.array(self.stations_value("lastdat")) == "YYYY-MM-DD")

    @cached_property
    def sources(self):
        return np.array([COSMOSStation(station_data=item) for item in self.stations_data["items"]])[self.has_data]

    @property
    def available_data_keys(self):
        return list(self.stations_data["items"][0].keys())

    ## UTILITY FUNCTIONS
    def stations_value(self, key, stations_data=None):
        """Returns a list of values for all the station for a particular key

        Parameters
        -----------
        key: str
           Key describing the station data. See self.available_data_keys for available keys.

        Returns
        --------
        list
            A list of the values for the keys for each station
        """
        if key not in self.available_data_keys:
            raise ValueError("Input key {} is not in available keys {}".format(key, self.available_data_keys))

        return self._stations_value(key, stations_data)

    def _stations_value(self, key, stations_data=None):
        """helper function for stations_value"""
        if stations_data is None:
            stations_data = self.stations_data

        return [i[key] for i in stations_data["items"]]

    @property
    def stations_label(self):
        return self.stations_value("label")

    def label_from_latlon(self, lat_lon):
        """Returns the COSMOS station's label given it's lat/lon coordinates

        Parameters
        -----------
        lat_lon : podpac.Coordinates
            The lat/lon locations whose station name will be returned. Note, the lat/lon coordinates have to match
            exactly the coordinates given in station_data[N]['location'], where N is the station.
            This should be Coordinates object with 'lat_lon' stacked coordinates as one of the dimensions.

        Returns
        --------
        list
            List of COSMOS station names corresponding to the given coordinates. If a coordinate has no match, then
            "None" is returned.
        """
        if "lon_lat" in lat_lon.dims:
            lat_lon = lat_lon.transpose("lon_lat")
        elif "lat_lon" not in lat_lon.dims:
            raise ValueError("The coordinates object must have a stacked 'lat_lon' dimension.")

        labels_map = {s["location"]: s["label"] for s in self.stations_data["items"]}
        labels = [labels_map.get(ll, None) for ll in lat_lon.xcoords["lat_lon"]]
        return labels

    def latlon_from_label(self, label):
        """Returns the lat/lon coordinates of COSMOS stations that match the given labels

        Parameters
        ------------
        label: str, list
            Strings that partially describe a COSMOS station label.

        Returns
        --------
        podpac.Coordinates
            The coordinates of the COSMOS stations matching the input data
        """
        if not isinstance(label, list):
            label = [label]

        ind = self._get_label_inds(label)
        if ind.size == 0:
            return podpac.Coordinates([])  # Empty

        lat_lon = np.array(self.stations_value("location"))[ind].squeeze()
        c = podpac.Coordinates([[lat_lon[0], lat_lon[1]]], ["lat_lon"])

        return c

    def _get_label_inds(self, label):
        """Helper function to get source indices for partially matched labels"""
        ind = []
        for lab in label:
            ind.extend([i for i, l in enumerate(self.stations_label) if lab.lower() in l.lower()])

        ind = np.unique(ind)
        return ind

    def get_calibration_data(self, label=None, lat_lon=None):
        """Returns the calibration information for a station. Users must supply a label or lat_lon coordinates.

        Parameters
        ------------
        label: str, List (optional)
            Labels describing the station.

        lat_lon: podpac.Coordinates (optional)
            Coordinates of the COSMOS station. Note, this object has to have a 'lat_lon' dimension which matches exactly
            with the COSMOS stations.

        Returns
        --------
        list
            A list of dictionaries containing the calibration data for the requested stations.
        """

        if label is None and lat_lon is None:
            raise ValueError("Must supply either 'label' or 'lat_lon'")

        if lat_lon is not None:
            label = self.label_from_latlon(lat_lon)

        if isinstance(label, string_types):
            label = [label]

        inds = self._get_label_inds(label)

        return [self.sources[i].calibration_data for i in inds]

    def get_site_properties(self, label=None, lat_lon=None):
        """Returns the site properties for a station. Users must supply a label or lat_lon coordinates.

        Parameters
        ------------
        label: str, List (optional)
            Labels describing the station.

        lat_lon: podpac.Coordinates (optional)
            Coordinates of the COSMOS station. Note, this object has to have a 'lat_lon' dimension which matches exactly
            with the COSMOS stations.

        Returns
        --------
        list
            A list of dictionaries containing the properties for the requested stations.
        """

        if label is None and lat_lon is None:
            raise ValueError("Must supply either 'label' or 'lat_lon'")

        if lat_lon is not None:
            label = self.label_from_latlon(lat_lon)

        if isinstance(label, string_types):
            label = [label]

        inds = self._get_label_inds(label)

        return [self.sources[i].site_properties for i in inds]

    def get_station_data(self, label=None, lat_lon=None):
        """Returns the station data. Users must supply a label or lat_lon coordinates.

        Parameters
        ------------
        label: str, List (optional)
            Labels describing the station.

        lat_lon: podpac.Coordinates (optional)
            Coordinates of the COSMOS station. Note, this object has to have a 'lat_lon' dimension which matches exactly
            with the COSMOS stations.

        Returns
        --------
        list
            A list of dictionaries containing the data for the requested stations.
        """

        if label is None and lat_lon is None:
            raise ValueError("Must supply either 'label' or 'lat_lon'")

        if lat_lon is not None:
            label = self.label_from_latlon(lat_lon)

        if isinstance(label, string_types):
            label = [label]

        inds = self._get_label_inds(label)

        return [self.stations_data["items"][i] for i in inds]


[docs]class COSMOSStations(InterpolationMixin, COSMOSStationsRaw): @tl.default("interpolation") def _interpolation_default(self): return {"method": "nearest", "params": {"use_selector": False, "remove_nan": False, "time_scale": "1,M"}}
if __name__ == "__main__": bounds = {"lat": [40, 46], "lon": [-78, -68]} cs = COSMOSStations( cache_ctrl=["ram", "disk"], interpolation={"method": "nearest", "params": {"use_selector": False, "remove_nan": True, "time_scale": "1,M"}}, ) csr = COSMOSStationsRaw( cache_ctrl=["ram", "disk"], interpolation={"method": "nearest", "params": {"use_selector": False, "remove_nan": True, "time_scale": "1,M"}}, ) sd = cs.stations_data ci = cs.source_coordinates.select(bounds) ce = podpac.coordinates.merge_dims( [podpac.Coordinates([podpac.crange("2018-05-01", "2018-06-01", "1,D", "time")]), ci] ) cg = podpac.Coordinates( [ podpac.clinspace(ci["lat"].bounds[1], ci["lat"].bounds[0], 12, "lat"), podpac.clinspace(ci["lon"].bounds[1], ci["lon"].bounds[0], 16, "lon"), ce["time"], ] ) o = cs.eval(ce) o_r = csr.eval(ce) og = cs.eval(cg) # Test helper functions labels = cs.stations_label lat_lon = cs.latlon_from_label("Manitou") labels = cs.label_from_latlon(lat_lon) lat_lon2 = cs.latlon_from_label("No Match Here") cal = cs.get_calibration_data("Manitou") props = cs.get_site_properties("Manitou") from matplotlib import rcParams rcParams["axes.labelsize"] = 12 rcParams["xtick.labelsize"] = 10 rcParams["ytick.labelsize"] = 10 rcParams["legend.fontsize"] = 8 rcParams["lines.linewidth"] = 2 rcParams["font.size"] = 12 import matplotlib.pyplot as plt import matplotlib.dates as mdates from pandas.plotting import register_matplotlib_converters register_matplotlib_converters() fig = plt.figure(figsize=(6.5, 3), dpi=300) plt.plot(o.time, o.data, "o-") ax = plt.gca() plt.ylim(0, 1) plt.legend(cs.label_from_latlon(ce)) # plt.plot(o_r.time, o_r.data, ".-") plt.ylabel("Soil Moisture ($m^3/m^3$)") plt.xlabel("Date") # plt.xticks(rotation=90) fig.autofmt_xdate() ax.fmt_xdata = mdates.DateFormatter("%m-%d") plt.title("COSMOS Data for 2018 over lat (40, 46) by lon (-78,-68)") plt.tight_layout() plt.show() print("Done")