NRT data#

import time

import matplotlib.pyplot as plt
import pandas as pd

import uscrn

Recent hourly data#

With uscrn.get_nrt_data(), we can load recent (near-real-time) data from USCRN by specifying the period we want and which dataset to draw from. Here, we request the 6 most recent hourly files.

now = pd.Timestamp.now("UTC")
print(now)

df = uscrn.get_nrt_data(
    (-6, None),
    "hourly",
    n_jobs=2,
)
2024-05-09 23:42:13.962376+00:00
Discovering files...
  Looking for files in these years
  - 2024
Found 6 file(s) to load
https://www.ncei.noaa.gov/pub/data/uscrn/products/hourly02/updates/2024/CRN60H0203-202405091700.txt
...
https://www.ncei.noaa.gov/pub/data/uscrn/products/hourly02/updates/2024/CRN60H0203-202405092200.txt
Reading files...
[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done   1 tasks      | elapsed:    1.9s
[Parallel(n_jobs=2)]: Done   4 out of   6 | elapsed:    2.2s remaining:    1.1s
[Parallel(n_jobs=2)]: Done   6 out of   6 | elapsed:    2.3s finished
df
wban utc_time lst_time crx_vn longitude latitude t_calc t_hr_avg t_max t_min ... soil_moisture_5 soil_moisture_10 soil_moisture_20 soil_moisture_50 soil_moisture_100 soil_temp_5 soil_temp_10 soil_temp_20 soil_temp_50 soil_temp_100
0 03047 2024-05-09 16:00:00 2024-05-09 10:00:00 2.622 -102.809998 31.620001 28.900000 27.799999 29.000000 26.700001 ... 0.044 0.027 0.042 0.032 0.031 27.799999 27.299999 25.000000 27.299999 25.900000
1 03048 2024-05-09 16:00:00 2024-05-09 09:00:00 2.622 -106.889999 34.360001 18.200001 16.900000 18.200001 15.300000 ... 0.047 0.075 0.072 0.067 0.068 21.500000 19.900000 21.299999 22.100000 21.799999
2 03054 2024-05-09 16:00:00 2024-05-09 10:00:00 2.622 -102.769997 33.959999 20.900000 19.600000 21.000000 18.600000 ... 0.063 0.111 0.134 0.112 0.134 21.700001 20.299999 21.299999 21.500000 18.600000
3 03055 2024-05-09 16:00:00 2024-05-09 10:00:00 2.622 -101.589996 36.599998 18.799999 18.299999 19.400000 17.000000 ... 0.082 0.138 0.140 0.234 0.144 15.900000 16.000000 16.100000 15.900000 15.600000
4 03060 2024-05-09 16:00:00 2024-05-09 09:00:00 2.622 -107.690002 38.540001 5.200000 3.600000 5.200000 1.700000 ... 0.216 0.292 0.295 0.336 0.367 5.300000 5.300000 6.000000 6.200000 5.700000
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
930 96405 2024-05-09 21:00:00 2024-05-09 12:00:00 2.514 -145.350006 60.470001 6.200000 6.100000 6.400000 5.700000 ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
931 96406 2024-05-09 21:00:00 2024-05-09 12:00:00 2.515 -154.130005 64.500000 4.500000 3.900000 5.000000 2.900000 ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
932 96407 2024-05-09 21:00:00 2024-05-09 12:00:00 2.515 -159.000000 66.559998 -0.800000 -1.000000 -0.700000 -1.400000 ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
933 96408 2024-05-09 21:00:00 2024-05-09 12:00:00 2.515 -150.869995 63.450001 NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
934 96409 2024-05-09 21:00:00 2024-05-09 12:00:00 2.514 -149.399994 68.650002 5.500000 5.000000 5.500000 4.200000 ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN

935 rows × 36 columns

Sometimes, a few sites can have times an hour earlier than the others. See the notes in uscrn.get_nrt_data() for more details.

def func(x):
    """Summarize a group of WBAN IDs: list them if there are few, otherwise elide."""
    nx = len(x)
    if nx == 0:
        return ""
    elif nx < 10:
        return sorted(x.unique())
    else:
        return "..."

# Count records per UTC time, and summarize which WBANs report at each time
(
    df.utc_time
    .value_counts()
    .sort_index()
    .to_frame()
    .assign(
        wbans=df.groupby("utc_time")["wban"].apply(func),
    )
)
count wbans
utc_time
2024-05-09 16:00:00 156 ...
2024-05-09 17:00:00 157 ...
2024-05-09 18:00:00 157 ...
2024-05-09 19:00:00 155 ...
2024-05-09 20:00:00 155 ...
2024-05-09 21:00:00 155 ...

In these files, for example, site WBAN 13301 has data included for multiple earlier days.
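
One way to spot-check this is to look at the distinct UTC times reported for that station in the current download (a rough sketch; 13301 may not appear in every batch of files, and wban is assumed to be a zero-padded string as in the frame above):

# Distinct UTC times for one station in the loaded NRT files
df.loc[df["wban"] == "13301", "utc_time"].drop_duplicates().sort_values()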

Plot temperature change time series#

fig, ax = plt.subplots(figsize=(7, 4))

df_ = df.copy()

# Limit to times within 7 hours of the latest time in the data
tmax = df_.utc_time.max()
df_ = df_[df_.utc_time.between(tmax - pd.Timedelta("7h"), tmax)]

# Mid-hour timestamps for plotting, plus each station's hourly-average temperature
# anomaly relative to its own mean over the window
df_["utc_time_mid"] = df_["utc_time"] + pd.Timedelta("30min")
df_["t_hr_avg_k"] = df_["t_hr_avg"].add(273.15)
df_["dt_hr_avg"] = df_["t_hr_avg_k"].sub(df_.groupby("wban")["t_hr_avg_k"].transform("mean"))

df_[["utc_time", "dt_hr_avg"]].groupby("utc_time").mean().plot(
    color="0.3",
    linewidth=3,
    zorder=10,
    legend=False,
    ax=ax,
)

df_.groupby("wban").plot(
    x="utc_time",
    y="dt_hr_avg",
    color="0.5",
    linewidth=1,
    alpha=0.4,
    legend=False,
    xlabel="Time (UTC)",
    ylabel="NRT temperature anomaly  $\Delta T$  (°C)",
    ax=ax,
)

ax.set_title(df.attrs["title"], loc="left", size=8);
[Figure: per-station NRT hourly temperature anomaly (ΔT) time series, with the all-station mean overlaid]

Plot current temperature#

fig, ax = plt.subplots(figsize=(7, 4.5))

ds = uscrn.to_xarray(df)

ds.isel(time=-1).plot.scatter(x="longitude", y="latitude", hue="t_hr_avg", ax=ax);
[Figure: latest hourly average temperature (t_hr_avg) at each station, plotted by longitude and latitude]
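
For a non-graphical view of the same latest-hour snapshot, we could summarize the distribution of hourly average temperatures across stations (a sketch using only pandas; output not shown):

# Distribution of t_hr_avg for the most recent hour in the loaded data
df[df["utc_time"] == df["utc_time"].max()]["t_hr_avg"].describe()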

Specific period of hourly data#

Date selection works by file (the timestamp in the file name), not by the data inside the file. In general, the data are an hour behind the file date/time. See the notes in uscrn.get_nrt_data() for more details.

def get_nrt_hourly_period(period):
    """Fetch hourly NRT data for a UTC time period, trimming to the requested times."""
    a, b = period
    # Request files one hour later than the desired data times,
    # since the data in a file generally lag the file time by an hour
    ap1 = pd.to_datetime(a) + pd.Timedelta(hours=1)
    bp1 = pd.to_datetime(b) + pd.Timedelta(hours=1)

    df = uscrn.get_nrt_data((ap1, bp1))

    time.sleep(0.5)  # for prints
    in_period = df.utc_time.between(a, b)
    print(
        f"Got {in_period.sum()}/{len(df)} ({in_period.sum() / len(df):.1%}) records "
        f"in desired period {a} to {b}"
    )
    outside = df.loc[~in_period, "utc_time"].value_counts()
    print(
        "Outside counts:",
        ", ".join(f"{t:%Y-%m-%d %H} ({count})" for t, count in outside.items()),
    )

    dupe = df.duplicated(["wban", "utc_time"], keep=False)
    print(f"Got {dupe.sum()} ({dupe.sum() / len(df):.1%}) duplicates")

    # Keep only rows in the desired period, dropping duplicate (wban, utc_time) records
    return (
        df[in_period]
        .drop_duplicates(["wban", "utc_time"], keep="last")
        .reset_index(drop=True)
    )


df = get_nrt_hourly_period(("2024-02-09 16", "2024-02-09 20"))
/home/docs/checkouts/readthedocs.org/user_builds/uscrn/envs/latest/lib/python3.10/site-packages/uscrn/data.py:712: UserWarning: Timestamp 2024-02-09 17:00:00 has no timezone, assuming UTC.
  warnings.warn(f"Timestamp {ts} has no timezone, assuming UTC.")
/home/docs/checkouts/readthedocs.org/user_builds/uscrn/envs/latest/lib/python3.10/site-packages/uscrn/data.py:712: UserWarning: Timestamp 2024-02-09 21:00:00 has no timezone, assuming UTC.
  warnings.warn(f"Timestamp {ts} has no timezone, assuming UTC.")
Discovering files...
  Looking for files in these years
  - 2024
Found 5 file(s) to load
https://www.ncei.noaa.gov/pub/data/uscrn/products/hourly02/updates/2024/CRN60H0203-202402091700.txt
...
https://www.ncei.noaa.gov/pub/data/uscrn/products/hourly02/updates/2024/CRN60H0203-202402092100.txt
Reading files...
[Parallel(n_jobs=1)]: Done   1 tasks      | elapsed:    0.2s
[Parallel(n_jobs=1)]: Done   4 tasks      | elapsed:    0.8s
Got 769/771 (99.7%) records in desired period 2024-02-09 16 to 2024-02-09 20
Outside counts: 2024-02-09 15 (2)
Got 6 (0.8%) duplicates
df
wban utc_time lst_time crx_vn longitude latitude t_calc t_hr_avg t_max t_min ... soil_moisture_5 soil_moisture_10 soil_moisture_20 soil_moisture_50 soil_moisture_100 soil_temp_5 soil_temp_10 soil_temp_20 soil_temp_50 soil_temp_100
0 03047 2024-02-09 16:00:00 2024-02-09 10:00:00 2.622 -102.809998 31.620001 11.300000 10.0 11.300000 8.300000 ... 0.048 0.038 0.056 0.045 0.035 9.3 9.1 9.5 12.1 13.3
1 03048 2024-02-09 16:00:00 2024-02-09 09:00:00 2.622 -106.889999 34.360001 3.000000 2.3 3.100000 0.900000 ... 0.101 0.140 0.109 0.056 0.057 3.1 4.4 6.2 8.2 9.2
2 03054 2024-02-09 16:00:00 2024-02-09 10:00:00 2.622 -102.769997 33.959999 6.700000 6.5 6.700000 5.900000 ... 0.132 0.160 0.164 0.105 0.125 5.0 5.8 6.9 9.0 10.3
3 03055 2024-02-09 16:00:00 2024-02-09 10:00:00 2.622 -101.589996 36.599998 8.700000 6.7 8.700000 3.900000 ... 0.250 0.306 0.324 0.412 0.217 4.5 5.0 5.6 6.4 6.6
4 03060 2024-02-09 16:00:00 2024-02-09 09:00:00 2.622 -107.690002 38.540001 -7.400000 -8.4 -7.400000 -10.400000 ... NaN NaN NaN 0.173 0.152 -0.3 -0.3 -0.1 0.7 2.0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
761 94996 2024-02-09 20:00:00 2024-02-09 14:00:00 2.622 -96.849998 40.700001 8.200000 7.8 8.400000 7.000000 ... 0.290 0.330 0.287 0.260 0.322 5.1 4.4 4.8 5.2 5.5
762 96404 2024-02-09 20:00:00 2024-02-09 11:00:00 2.514 -141.210007 62.740002 -21.700001 -22.9 -21.700001 -24.200001 ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
763 96405 2024-02-09 20:00:00 2024-02-09 11:00:00 2.514 -145.350006 60.470001 3.000000 2.9 3.200000 2.600000 ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
764 96407 2024-02-09 20:00:00 2024-02-09 11:00:00 2.515 -159.000000 66.559998 -9.700000 -9.3 -8.700000 -11.100000 ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
765 96409 2024-02-09 20:00:00 2024-02-09 11:00:00 2.514 -149.399994 68.650002 -10.800000 -9.6 -8.000000 -11.100000 ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN

766 rows × 36 columns
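
As a quick sanity check on the trimmed result, we could count how many stations report in each retained hour (a sketch using only pandas; output not shown):

# Number of distinct stations per retained hour
df.groupby("utc_time")["wban"].nunique()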

Recent daily data#

Here, we load the most recent daily data file.

df = uscrn.get_nrt_data(
    -1,
    "daily",
    n_jobs=1,
)
Discovering files...
  Looking for files in these years
  - 2024
Found 1 file(s) to load
https://www.ncei.noaa.gov/pub/data/uscrn/products/daily01/updates/2024/CRND0103-202405082359.txt
Reading files...
[Parallel(n_jobs=1)]: Done   1 tasks      | elapsed:    0.2s
[Parallel(n_jobs=1)]: Done   1 tasks      | elapsed:    0.2s
df
wban lst_date crx_vn longitude latitude t_daily_max t_daily_min t_daily_mean t_daily_avg p_daily_calc ... soil_moisture_5_daily soil_moisture_10_daily soil_moisture_20_daily soil_moisture_50_daily soil_moisture_100_daily soil_temp_5_daily soil_temp_10_daily soil_temp_20_daily soil_temp_50_daily soil_temp_100_daily
0 03047 2024-05-08 2.622 -102.809998 31.620001 34.700001 18.4 26.600000 27.000000 0.0 ... 0.050 0.032 0.040 0.033 0.031 30.000000 30.200001 29.700001 27.200001 25.5
1 03048 2024-05-08 2.622 -106.889999 34.360001 26.700001 12.1 19.400000 19.299999 0.0 ... 0.050 0.078 0.074 0.068 0.069 25.900000 24.900000 23.900000 22.000000 21.6
2 03054 2024-05-08 2.622 -102.769997 33.959999 30.200001 9.5 19.799999 20.600000 0.0 ... 0.068 0.114 0.135 0.112 0.134 24.000000 23.200001 22.299999 21.200001 18.6
3 03055 2024-05-08 2.622 -101.589996 36.599998 25.200001 6.4 15.800000 16.000000 0.0 ... 0.088 0.142 0.142 0.243 0.146 17.700001 17.600000 16.500000 15.800000 15.5
4 03060 2024-05-08 2.622 -107.690002 38.540001 6.600000 -6.2 0.200000 0.400000 0.0 ... 0.224 0.297 0.296 0.325 0.366 7.700000 7.200000 6.500000 6.000000 5.7
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
151 96405 2024-05-08 2.514 -145.350006 60.470001 7.400000 3.3 5.400000 5.200000 10.5 ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
152 96406 2024-05-08 2.515 -154.130005 64.500000 14.500000 -0.9 6.800000 6.500000 0.0 ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
153 96407 2024-05-08 2.515 -159.000000 66.559998 0.600000 -7.2 -3.300000 -2.900000 0.0 ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
154 96408 2024-05-08 2.515 -150.869995 63.450001 NaN NaN NaN NaN 0.7 ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
155 96409 2024-05-08 2.514 -149.399994 68.650002 -7.300000 -9.9 -8.600000 -8.400000 0.2 ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN

156 rows × 28 columns

fig, ax = plt.subplots(figsize=(7, 4.5))

ds = uscrn.to_xarray(df).squeeze()

ds.plot.scatter(x="longitude", y="latitude", hue="t_daily_max", ax=ax);
[Figure: daily maximum temperature (t_daily_max) at each station, plotted by longitude and latitude]
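
Beyond the map, we might also want to pick out extremes, for example the five stations with the highest daily maximum temperature (a sketch; output not shown):

# Five warmest stations by daily maximum temperature in the latest daily file
df.nlargest(5, "t_daily_max")[["wban", "lst_date", "t_daily_max"]]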