从终端机场天气预报(TAF)中解析天空状况的 DataFrame 方法。
TAF 中的一行可以报告零到八层云。云层在主导(prevailing)行中是必需的,在 TEMPO 行中是可选的。云量代码 SKC|FEW|SCT|BKN|OVC 与八分量(okta)值相关联:以 1、3、5、8 作为报告一层云所需的最小天空覆盖度。
我很难找到一个纯粹的 regex 解决方案来生成重复捕获组所需的模式。因此,我使用 _unpack_setup 函数来构造该模式及其对应的列名。
from typing import Iterable
import re
import pandas as pd
import numpy as np
# Sample terminal aerodrome forecast (TAF) parsed by get_sky_condition():
# one leading forecast line followed by FM / TEMPO change groups.
TAF = """
KGCC 282320Z 2900/2924 09010KT P6SM -SHRA BKN070 OVC250
FM290300 24011KT P6SM OVC040
TEMPO 2903/2906 4SM -SHRA FEW010 FEW015 BKN020TCU OVC025
FM291000 18009KT 3SM -TSRA BR OVC004CB
FM291900 31022G33KT 6SM -SHRA OVC011
"""
# Lookup table used by octave(): cloud-cover code -> numeric coverage value.
# SKC maps to infinity, and the NaN label maps a missing layer to NaN.
OCTAVE_INDEX = pd.Series(
(np.inf, 1, 3, 5, 8, np.nan), index=["SKC", "FEW", "SCT", "BKN", "OVC", np.nan]
)
def _unpack_setup():
    """Build the cloud-layer regex and the column labels for its groups.

    Returns:
        A 2-tuple ``(pattern, labels)``. ``pattern`` is the compiled regex
        for one mandatory cloud layer followed by seven optional ones, each
        layer contributing three capture groups (cover, base, flag).
        ``labels`` is a Series with the 24 matching column names
        ``CloudCover1, CloudBase1, Flags1, ..., Flags8``.
    """
    single_layer = r"(SKC|FEW|SCT|BKN|OVC)(\d{3})?(CB|TCU)?\s?"
    optional_layer = "(?:" + single_layer + ")?"
    pattern_text = single_layer + "".join([optional_layer] * 7)

    prefixes = pd.Series(["CloudCover", "CloudBase", "Flags"])
    # One three-label Series per layer number, concatenated in layer order.
    numbered = [prefixes + str(layer_no) for layer_no in range(1, 9)]
    return re.compile(pattern_text, re.VERBOSE), pd.concat(numbered)


celestial_dome, cloud_columns = _unpack_setup()
def unpack_index(index: pd.Index, *args: str) -> Iterable[pd.Index]:
    """Yield one sub-index per pattern, in the order the patterns are given.

    Args:
        index: String index to filter (e.g. DataFrame column labels).
        *args: Patterns handed to ``Index.str.contains``.

    Yields:
        For each pattern, the labels of ``index`` that contain it.
    """
    yield from (index[index.str.contains(needle)] for needle in args)
def octave(sky_coverage: pd.Series) -> np.ndarray:
    """Look up every cloud-cover code of ``sky_coverage`` in ``OCTAVE_INDEX``.

    Returns:
        The looked-up values as a NumPy array, in the order of the input.
    """
    looked_up = OCTAVE_INDEX[sky_coverage]
    return looked_up.values
def get_sky_condition() -> pd.DataFrame:
    """Create the sky-condition DataFrame from the module-level ``TAF``.

    The forecast is split into one string per group (at ``BECMG``, ``TEMPO``
    or ``FM``), the cloud layers are extracted with ``celestial_dome`` and
    labelled with ``cloud_columns``; columns that are empty for every group
    are dropped. Cloud bases are scaled by 100 and cloud-cover codes are
    replaced by their ``OCTAVE_INDEX`` values.

    Returns:
        The resulting DataFrame. It is still printed as before, so existing
        callers that relied on the printed output keep working.
    """
    series = pd.Series(re.split(r"(?:\s(?=BECMG|TEMPO|FM))", TAF.strip())).str.strip()
    sky_condition: pd.DataFrame = (
        series.str.extract(celestial_dome)
        .set_axis(cloud_columns, axis=1)
        .dropna(axis=1, how="all")
    )
    column_base, column_cover = unpack_index(
        sky_condition.columns, "CloudBase", "CloudCover"
    )
    # Bases are captured as three-digit strings; scale them by 100.
    sky_condition[column_base] = sky_condition[column_base].astype(float) * 100
    # Column-wise lookup of the cover codes.
    sky_condition[column_cover] = sky_condition[column_cover].apply(octave)
    print(sky_condition)
    # A "get_" function should hand its result back, not only print it.
    return sky_condition
if __name__ == "__main__":
    get_sky_condition()
CloudCover1 CloudBase1 Flags1 CloudCover2 CloudBase2 CloudCover3 CloudBase3 Flags3 CloudCover4 CloudBase4
0 5.0 7000.0 NaN 8.0 25000.0 NaN NaN NaN NaN NaN
1 8.0 4000.0 NaN NaN NaN NaN NaN NaN NaN NaN
2 2.0 1000.0 NaN 2.0 1500.0 5.0 2000.0 TCU 8.0 2500.0
3 8.0 400.0 CB NaN NaN NaN NaN NaN NaN NaN
4 8.0 1100.0 NaN NaN NaN NaN NaN NaN NaN NaN
发布于 2022-04-29 07:16:10
在get_sky_condition()的开头,我不明白为什么在定义series时执行.str.strip():
series = pd.Series(re.split(r"(?:\s(?=BECMG|TEMPO|FM))", TAF.strip())).str.strip()
我觉得这样就足够了:
series = pd.Series(re.split(r"(?:\s(?=BECMG|TEMPO|FM))", TAF.strip()))
对于正则表达式,可以利用命名捕获组,从而不必调用 .set_axis(cloud_columns, axis=1) 来命名列。
def cloud_layers_re() -> re.Pattern[str]:
    """Compile a regex matching one to eight cloud layers with named groups.

    Layer ``i`` (1-8) contributes the named groups ``CloudCover{i}``,
    ``CloudBase{i}`` and ``Flags{i}``. The first layer is mandatory; layers
    2-8 are optional and each must be preceded by whitespace.

    Returns:
        The compiled pattern (24 named groups in total).
    """
    layer_re_fmt = (
        r"(?P<CloudCover{0}>SKC|FEW|SCT|BKN|OVC)"
        r"(?P<CloudBase{0}>\d{{3}})?"
        r"(?P<Flags{0}>CB|TCU)?"
    )
    # Raw literal on purpose: "\s" in a plain string is an invalid escape
    # sequence and triggers a warning on current Python versions.
    optional_layers = "".join(
        r"(?:\s+" + layer_re_fmt.format(i) + ")?" for i in range(2, 9)
    )
    return re.compile(layer_re_fmt.format(1) + optional_layers)
⋮
def get_sky_condition():
"""Create the sky-condition DataFrame (excerpt; the rest of the body is elided)."""
# Split the TAF into one string per forecast group at BECMG / TEMPO / FM.
series = pd.Series(re.split(r"(?:\s(?=BECMG|TEMPO|FM))", TAF.strip()))
sky_condition: pd.DataFrame = (
# The pattern's named groups supply the column names, so no set_axis() call.
series.str.extract(cloud_layers_re())
.dropna(axis=1, how="all")
)
⋮
因为 get_sky_condition() 的命名像一个 getter 函数,所以我希望它返回结果,而不是把结果打印出来。
发布于 2022-05-01 16:58:05
你的索引又出问题了。我认为你的数据的形状严重歪曲了数据实际想表达的含义:
除以上情况外,每高度都有一些云层。
每当你说"每(per)"时,就应该对应一个 MultiIndex 层级。不要写 CloudCover1、CloudCover2 之类的列。两阶段的 extract 可以为你做到这一点。因为存在两种不同的基数,所以会有两个不同的 DataFrame。换句话说,能见度测量的数量与云测量的数量相差很大,把它们混在同一个 DataFrame 中没有意义,而且用数据库的术语来说是反规范化的(denormalized)。这两个独立的 DataFrame 会有一些共同的索引层级。
import re
import pandas as pd
# Based on https://aviationweather.gov/taf/decoder#Forecast
# Verbose regex for a single TAF forecast line, matched from the start of the
# line. Named groups: group, time, wind, vis, weather, clouds. Only `time` is
# mandatory; `clouds` captures the whole run of cloud tokens as one string,
# which get_sky_condition() later splits up with CLOUD_PATTERN.
TAF_PATTERN = re.compile(
r'''(?x) # verbose
^\s* # beginning, strip whitespace
(?P<group>[A-Z]+)? # time group kind, greedy, optional
(?: # non-capture: separator between group name and time
(?<!FM)\s+ # spaces for every group except FM
)?
(?P<time>\d\S+) # group time, starting with any digit, greedy, mandatory
(?: # non-capture: wind speed with separator, optional
\s+ # at least one separator space
(?P<wind>\S*KT) # anything followed by knots, greedy
)?
(?: # non-capture: visibilitity with separator, optional
\s+ # at least one separator space
(?P<vis> # visibility
P? # "more than"
\d+ # distance figure
(?:SM)? # unit: 'statute miles' or implied metres
)
)?
(?: # non-capture: weather with separator, optional
\s+ # at least one separator space
(?P<weather>
(?:\+|-|VC)? # intensity or proximity
(?: # weather fragments, mandatory, greedy
\s* # any spaces between weather fragments
(?:
MI|BC|DR|BL|SH|TS|FZ|PR| # Qualifier descriptor
DZ|RA|SN|SG|IC|PL|GR|GS|UP| # Precipitation
BR|FG|FU|DU|SA|HZ|PY|VA| # Obscuration
PO|SQ|FC|\+FC|SS|DS # "Other"
)
)+
)
)?
(?P<clouds> # cloud measurements, optional
(?: # non-capture: clouds, mandatory, greedy, multiple included
\s+ # at least one separator space
(?: # cloud density measured in "octals" (eighths)
VV|NSC|SKC|NCD|CLR|FEW|SCT|BKN|OVC
)
\d* # observation altitude in hundreds of feet
(?:CB|TCU)? # clouds, optional, cumulonimbus or towering cumulus
)+
)?
# Don't specify the rest, and don't match on the end. This may exclude
# wind shear, probability, etc.
'''
)
# Verbose regex for one cloud token inside the `clouds` string captured by
# TAF_PATTERN. Named groups: density (mandatory), altitude and kind (both
# optional). Used with Series.str.extractall, i.e. one result row per token.
CLOUD_PATTERN = re.compile(
r'''(?x) # verbose
(?P<density> # cloud density measured in "octals" (eighths)
VV|NSC|SKC|NCD|CLR|FEW|SCT|BKN|OVC
)
(?P<altitude> # observation altitude in hundreds of feet, greedy, optional
\d+
)?
(?P<kind> # cloud kind, cumulonimbus or towering cumulus, optional
CB|TCU
)?
'''
)
def get_sky_condition(taf: str) -> tuple[
    pd.DataFrame,  # Groups
    pd.DataFrame,  # Clouds
]:
    """Parse a TAF into a per-group frame and a per-cloud-layer frame.

    Args:
        taf: Raw TAF text. The first two whitespace-separated tokens are
            taken as station and origin time; every following line is one
            forecast group.

    Returns:
        A 2-tuple ``(groups, clouds)``. ``groups`` is indexed by
        ``(station, origin_time, group, time)`` and holds the ``wind``,
        ``vis`` and ``weather`` columns. ``clouds`` carries the same index
        levels plus ``altitude`` and holds ``density`` and ``kind``.

    Raises:
        ValueError: If ``taf`` has fewer than three whitespace-separated
            tokens (tuple unpacking of the split fails).
    """
    station, origin_time, body = taf.split(maxsplit=2)
    lines = pd.Series(body.splitlines())

    df: pd.DataFrame = lines.str.extract(TAF_PATTERN)
    df['station'] = station
    df['origin_time'] = origin_time
    df.set_index(['station', 'origin_time', 'group', 'time'], inplace=True)

    # Second stage: one row per cloud token found in each group's cloud text.
    clouds: pd.DataFrame = df['clouds'].str.extractall(CLOUD_PATTERN)
    # CLOUD_PATTERN makes the altitude optional, so tokens without a height
    # leave NaN here. astype(int) would raise on those; to_numeric keeps an
    # integer result when every altitude is present and falls back to float
    # with NaN otherwise.
    clouds['altitude'] = pd.to_numeric(clouds['altitude']) * 100
    clouds = clouds.droplevel('match').set_index('altitude', append=True)

    df.drop(columns=['clouds'], inplace=True)
    return df, clouds
def test() -> None:
    """Parse a sample TAF and print the resulting group and cloud frames."""
    taf = """
KGCC 282320Z 2900/2924 09010KT P6SM -SHRA BKN070 OVC250
FM290300 24011KT P6SM OVC040
TEMPO 2903/2906 4SM -SHRA FEW010 FEW015 BKN020TCU OVC025
FM291000 18009KT 3SM -TSRA BR OVC004CB
FM291900 31022G33KT 6SM -SHRA OVC011
TEMPO 2903/2906 5000 TSRA
"""
    groups, clouds = get_sky_condition(taf)
    # Heading first, then the frame, for each of the two results.
    for heading, frame in (('Groups:', groups), ('Clouds:', clouds)):
        print(heading)
        print(frame)
if __name__ == "__main__":
    test()
Groups:
wind vis weather
station origin_time group time
KGCC 282320Z NaN 2900/2924 09010KT P6SM -SHRA
FM 290300 24011KT P6SM NaN
TEMPO 2903/2906 NaN 4SM -SHRA
FM 291000 18009KT 3SM -TSRA BR
291900 31022G33KT 6SM -SHRA
TEMPO 2903/2906 NaN 5000 TSRA
Clouds:
density kind
station origin_time group time altitude
KGCC 282320Z NaN 2900/2924 7000 BKN NaN
25000 OVC NaN
FM 290300 4000 OVC NaN
TEMPO 2903/2906 1000 FEW NaN
1500 FEW NaN
2000 BKN TCU
2500 OVC NaN
FM 291000 400 OVC CB
291900 1100 OVC NaN
https://codereview.stackexchange.com/questions/276142
复制相似问题