我在这里处理这个函数,它产生所需的输出。我只想确定我是以一种明智的方式去做事情。
它只是对数据进行有效的排序,同时考虑到不同公司可能发起的活动。然后,它聚合数据。
from collections import defaultdict
def aggregate_ads(ad_data, labels=list(), default_advertiser='internal'):
# Creates dicts to hold data, structured to provide code-readability
ads_data = defaultdict(
lambda: defaultdict(
lambda: defaultdict(int)
))
# Lowercases all labels
labels = map(str.lower, labels)
# Sorts each instance into its channel and adds
for adgroup in ad_data:
# Cleans and standardizes campaign name
campaign_name = adgroup['campaign']['name']
campaign_name = campaign_name.replace('-', ' ').replace('_', ' ').lower()
# Handling where ad_group type is not provided
if not adgroup['ad_group'].get('type_'):
adgroup['ad_group']['type_'] = 'MIXED'
# Collects channel and metrics
channel = adgroup['ad_group']['type_']
metrics = dict(
impressions= int(adgroup['metrics']['impressions']),
clicks = int(adgroup['metrics']['clicks']),
# Converts cost in microns to usd
cost = round(int(adgroup['metrics']['cost_micros'])/1000000, 2),
)
# Checks for labels in campaign name and defaults to specified default
advertisers = set(labels).intersection(campaign_name.split())
if not advertisers:
advertisers.add(default_advertiser)
# Adds the variables to ads_data
for advertiser in advertisers:
ads_data[advertiser][channel]['impressions'] += metrics['impressions']
ads_data[advertiser][channel]['clicks'] += metrics['clicks']
ads_data[advertiser][channel]['cost'] += metrics['cost']
# Converts into regular dict on return
return dict(
(advertiser, dict((ad_type, dict(metrics))
for ad_type, metrics in ad_data.items()))
for advertiser, ad_data in ads_data.items())这是我的输出:
{'internal': {'DISPLAY_STANDARD': {'clicks': 163,
'cost': 11.8,
'impressions': 6785},
'MIXED': {'clicks': 6, 'cost': 0.1, 'impressions': 434},
'SEARCH_STANDARD': {'clicks': 2,
'cost': 5.89,
'impressions': 151}},
'play': {'MIXED': {'clicks': 5, 'cost': 0.05, 'impressions': 242}}}和一些样本输入:
example = [
{'campaign': {'resource_name': 'blahblahbah',
'serving_status': 'SERVING',
'name': 'Google Play Market-USA/Canada-2022-08',
'start_date': '2022-07-20',
'end_date': '2037-12-30'},
'ad_group': {'resource_name': 'blahblahbah',
'status': 'ENABLED',
'type_': 'MIXED'},
'metrics': {'clicks': '5', 'cost_micros': '54238', 'impressions': '242'},
'ad_group_ad': {'resource_name': 'blahblahbah',
'status': 'ENABLED',
'ad': {'resource_name': 'blahblahbah'}},
'segments': {'date': '2022-10-11'}},
{'campaign': {'resource_name': 'blahblahbah',
'serving_status': 'SERVING',
'name': 'Google Play Market-USA/Canada-2022-08',
'start_date': '2022-07-20',
'end_date': '2037-12-30'},
'ad_group': {'resource_name': 'blahblahbah',
'status': 'ENABLED',
'type_': 'MIXED'},
'metrics': {'clicks': '3', 'cost_micros': '53943', 'impressions': '217'},
'ad_group_ad': {'resource_name': 'blahblahbah',
'status': 'ENABLED',
'ad': {'resource_name': 'blahblahbah'}},
'segments': {'date': '2022-10-11'}},
{'campaign': {'resource_name': 'blahblahbah',
'serving_status': 'SERVING',
'name': 'Google Play Market-USA/Canada-2022-08',
'start_date': '2022-07-20',
'end_date': '2037-12-30'},
'ad_group': {'resource_name': 'blahblahbah',
'status': 'ENABLED',
'type_': 'MIXED'},
'metrics': {'clicks': '3', 'cost_micros': '53943', 'impressions': '217'},
'ad_group_ad': {'resource_name': 'blahblahbah',
'status': 'ENABLED',
'ad': {'resource_name': 'blahblahbah'}},
'segments': {'date': '2022-10-11'}},
{'campaign': {'resource_name': 'blahblahbah',
'serving_status': 'SERVING',
'name': 'Display-Global-Desktop-202208',
'start_date': '2022-07-21',
'end_date': '2037-12-30'},
'ad_group': {'resource_name': 'blahblahbah',
'status': 'ENABLED',
'type_': 'DISPLAY_STANDARD'},
'metrics': {'clicks': '95', 'cost_micros': '6036546', 'impressions': '4186'},
'ad_group_ad': {'resource_name': 'blahblahbah',
'status': 'ENABLED',
'ad': {'resource_name': 'blahblahbah'}},
'segments': {'date': '2022-10-11'}},
{'campaign': {'resource_name': 'blahblahbah',
'serving_status': 'SERVING',
'name': 'Search-USA/NOTES',
'start_date': '2022-08-30',
'end_date': '2037-12-30'},
'ad_group': {'resource_name': 'blahblahbah',
'status': 'ENABLED',
'type_': 'SEARCH_STANDARD'},
'metrics': {'clicks': '2', 'cost_micros': '5890000', 'impressions': '151'},
'ad_group_ad': {'resource_name': 'blahblahbah',
'status': 'ENABLED',
'ad': {'resource_name': 'blahblahbah'}},
'segments': {'date': '2022-10-11'}},
{'campaign': {'resource_name': 'blahblahbah',
'serving_status': 'SERVING',
'name': 'Display-Global--Desktop-Files',
'start_date': '2022-09-02',
'end_date': '2037-12-30'},
'ad_group': {'resource_name': 'blahblahbah',
'status': 'ENABLED',
'type_': 'DISPLAY_STANDARD'},
'metrics': {'clicks': '68', 'cost_micros': '5757098', 'impressions': '2599'},
'ad_group_ad': {'resource_name': 'blahblahbah',
'status': 'ENABLED',
'ad': {'resource_name': 'blahblahbah'}},
'segments': {'date': '2022-10-11'}}
]labels = ['play'] # In reality, this is partner labels they'd put in the campaign name
aggregate_ads(example, labels)发布于 2022-10-12 21:34:28
你有个严重的错误。您将一个map分配给labels,但它从未实现,因此会被消耗,随后看起来就像一个空集合。
我建议您放弃大部分实现,用Pandas替换它,这非常适合您的情况。入口点将通过pd.json_normalize放置示例代码,这将为示例数据提供一个有六行的数据框架。
用Pandas to_dict替换dict生成器,并以矢量化的方式执行和,而不是在循环中执行求和,代码可能如下所示:
from pprint import pprint
import pandas as pd
def aggregate_ads(ad_data: dict, labels: set[str], default_advertiser: str = 'internal') -> pd.DataFrame:
df = pd.json_normalize(ad_data).astype({
'metrics.clicks': int,
'metrics.cost_micros': int,
'metrics.impressions': int,
})
df['channel'] = df['ad_group.type_'].fillna('MIXED')
df['metrics.cost'] = df['metrics.cost_micros'] / 1e6
campaign_fragments = (
df['campaign.name']
.str.replace('-', ' ').str.replace('_', ' ').str.lower().str.split()
.apply(set).apply(labels.__and__)
)
campaign_fragments[campaign_fragments == set()] = default_advertiser
df['advertisers'] = campaign_fragments
return (
df.explode('advertisers')
.groupby(['advertisers', 'channel'])
['metrics.impressions', 'metrics.clicks', 'metrics.cost']
.sum()
.rename(columns={
'metrics.impressions': 'impressions',
'metrics.clicks': 'clicks',
'metrics.cost': 'cost',
})
)
def ads_to_json(groups: pd.DataFrame) -> dict:
inners = groups.groupby(level=0).apply(
lambda df: df.droplevel(0).to_dict('index'))
return inners.to_dict()
def test() -> None:
example = [
{'campaign': {'resource_name': 'blahblahbah',
'serving_status': 'SERVING',
'name': 'Google Play Market-USA/Canada-2022-08',
'start_date': '2022-07-20',
'end_date': '2037-12-30'},
'ad_group': {'resource_name': 'blahblahbah',
'status': 'ENABLED',
'type_': 'MIXED'},
'metrics': {'clicks': '5', 'cost_micros': '54238', 'impressions': '242'},
'ad_group_ad': {'resource_name': 'blahblahbah',
'status': 'ENABLED',
'ad': {'resource_name': 'blahblahbah'}},
'segments': {'date': '2022-10-11'}},
{'campaign': {'resource_name': 'blahblahbah',
'serving_status': 'SERVING',
'name': 'Google Play Market-USA/Canada-2022-08',
'start_date': '2022-07-20',
'end_date': '2037-12-30'},
'ad_group': {'resource_name': 'blahblahbah',
'status': 'ENABLED',
'type_': 'MIXED'},
'metrics': {'clicks': '3', 'cost_micros': '53943', 'impressions': '217'},
'ad_group_ad': {'resource_name': 'blahblahbah',
'status': 'ENABLED',
'ad': {'resource_name': 'blahblahbah'}},
'segments': {'date': '2022-10-11'}},
{'campaign': {'resource_name': 'blahblahbah',
'serving_status': 'SERVING',
'name': 'Google Play Market-USA/Canada-2022-08',
'start_date': '2022-07-20',
'end_date': '2037-12-30'},
'ad_group': {'resource_name': 'blahblahbah',
'status': 'ENABLED',
'type_': 'MIXED'},
'metrics': {'clicks': '3', 'cost_micros': '53943', 'impressions': '217'},
'ad_group_ad': {'resource_name': 'blahblahbah',
'status': 'ENABLED',
'ad': {'resource_name': 'blahblahbah'}},
'segments': {'date': '2022-10-11'}},
{'campaign': {'resource_name': 'blahblahbah',
'serving_status': 'SERVING',
'name': 'Display-Global-Desktop-202208',
'start_date': '2022-07-21',
'end_date': '2037-12-30'},
'ad_group': {'resource_name': 'blahblahbah',
'status': 'ENABLED',
'type_': 'DISPLAY_STANDARD'},
'metrics': {'clicks': '95', 'cost_micros': '6036546', 'impressions': '4186'},
'ad_group_ad': {'resource_name': 'blahblahbah',
'status': 'ENABLED',
'ad': {'resource_name': 'blahblahbah'}},
'segments': {'date': '2022-10-11'}},
{'campaign': {'resource_name': 'blahblahbah',
'serving_status': 'SERVING',
'name': 'Search-USA/NOTES',
'start_date': '2022-08-30',
'end_date': '2037-12-30'},
'ad_group': {'resource_name': 'blahblahbah',
'status': 'ENABLED',
'type_': 'SEARCH_STANDARD'},
'metrics': {'clicks': '2', 'cost_micros': '5890000', 'impressions': '151'},
'ad_group_ad': {'resource_name': 'blahblahbah',
'status': 'ENABLED',
'ad': {'resource_name': 'blahblahbah'}},
'segments': {'date': '2022-10-11'}},
{'campaign': {'resource_name': 'blahblahbah',
'serving_status': 'SERVING',
'name': 'Display-Global--Desktop-Files',
'start_date': '2022-09-02',
'end_date': '2037-12-30'},
'ad_group': {'resource_name': 'blahblahbah',
'status': 'ENABLED',
'type_': 'DISPLAY_STANDARD'},
'metrics': {'clicks': '68', 'cost_micros': '5757098', 'impressions': '2599'},
'ad_group_ad': {'resource_name': 'blahblahbah',
'status': 'ENABLED',
'ad': {'resource_name': 'blahblahbah'}},
'segments': {'date': '2022-10-11'}}
]
grouped = aggregate_ads(example, labels={'play'})
js = ads_to_json(grouped)
pprint(js)
if __name__ == '__main__':
test()https://codereview.stackexchange.com/questions/280395
复制相似问题