Amazon Review Data With Images(2018)

原数据集地址

处理脚本

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
"""
得到含有Image字段的数据集
author: Orch1d
date: 2025-12-07
"""
import json
import os.path

# Directory that main() joins with the dataset name to locate the input file.
data_root = './dataset'
# Directory that main() joins with the output file name when saving results.
save_dir = './new_dataset'

import logging
import colorlog
import fire


def setup_logger():
    """Return the colorized logger named ``'example'``, configuring it once.

    On the first call a ``colorlog.StreamHandler`` with a level-colored
    formatter is attached, the level is set to ``DEBUG`` and propagation to
    parent loggers is switched off. Subsequent calls return the same logger
    without attaching a second handler.

    Returns:
        The configured logger.
    """
    logger = colorlog.getLogger('example')

    # Inspect this logger's own handler list. ``hasHandlers()`` also counts
    # handlers attached to ancestor loggers (for instance a configured root
    # logger), which would make this function hand back a logger that never
    # received its handler, level or propagate settings.
    if logger.handlers:
        return logger

    # %(log_color)s is replaced by the color mapped to the record's level.
    formatter = colorlog.ColoredFormatter(
        "%(log_color)s[%(asctime)s] %(levelname)-8s%(reset)s %(message)s",
        datefmt="%Y-%m-%d %H:%M:%S",
        reset=True,
        log_colors={
            'DEBUG': 'cyan',
            'INFO': 'green',
            'WARNING': 'yellow',
            'ERROR': 'red',
            'CRITICAL': 'red,bg_white',
        },
        secondary_log_colors={},
        style='%',
    )

    handler = colorlog.StreamHandler()
    handler.setFormatter(formatter)

    logger.addHandler(handler)
    logger.setLevel(logging.DEBUG)
    # Keep records away from parent loggers so they are not emitted twice.
    logger.propagate = False
    return logger


def main(**kwargs):
    """Keep only the records of one dataset file that have an ``image`` field.

    The file ``<data_root>/<dataset>`` is read line by line; every non-blank
    line is parsed as JSON. Records whose ``image`` value is present and not
    ``None`` are collected and written, one JSON document per line, to
    ``<save_dir>/<dataset>_with_image.json``. Summary statistics are logged.

    Args:
        **kwargs: Command-line options. Only ``dataset`` (the input file name
            relative to ``data_root``) is read; it is required.

    Returns:
        None. Problems are reported through the logger rather than raised.
    """
    log = setup_logger()
    log.info(kwargs)

    name = kwargs.get('dataset')
    if not name:
        log.error("请通过命令行参数 '--dataset' 指定数据集名称")
        return

    input_file_path = os.path.join(data_root, name)
    log.info(f"input_file_path: {input_file_path}")

    if not os.path.exists(input_file_path):
        log.error(f"path not exists: {input_file_path}")
        return

    output_file_path = os.path.join(save_dir, name + '_with_image.json')

    try:
        total_count = 0  # number of non-blank input lines
        records_with_image = []

        # Read with an explicit encoding so the result does not depend on the
        # platform default; undecodable bytes are still dropped as before.
        with open(input_file_path, 'r', encoding='utf-8', errors='ignore') as file:
            for line_num, line in enumerate(file, 1):
                line = line.strip()
                if not line:
                    continue

                total_count += 1

                try:
                    js = json.loads(line)
                except json.JSONDecodeError as e:
                    log.error(f"failed to parse json in line {line_num}:{e}")
                    continue

                # A line can be valid JSON without being an object (a list,
                # a number, ...); such lines cannot carry an 'image' key and
                # must not abort the whole run.
                if isinstance(js, dict) and js.get('image') is not None:
                    records_with_image.append(js)

        # Guard the ratio against an input file with no non-blank lines.
        if total_count:
            ratio = len(records_with_image) / total_count * 100
        else:
            ratio = 0.0

        log.info("=" * 40)
        log.info(f"数据统计:")
        log.info(f" - 原数据总条数: {total_count}")
        log.info(f" - 包含'image'字段的条数: {len(records_with_image)}")
        log.info(f" - 筛选比例: {ratio:.2f}%")
        log.info("=" * 40)

        if records_with_image:
            # Make the script usable on its own, without a prior mkdir.
            os.makedirs(save_dir, exist_ok=True)
            with open(output_file_path, 'w', encoding='utf-8') as out_file:
                for record in records_with_image:
                    out_file.write(json.dumps(record, ensure_ascii=False) + '\n')
            log.info(f"saved in: {output_file_path}")
        else:
            log.warning("Image is not included in the raw data")

    except Exception as e:
        # Top-level boundary of the CLI: log the traceback instead of crashing.
        log.exception(f"Unexpected Exception: {e}")


# Script entry point: hand main() to python-fire so that command-line flags
# such as `--dataset All_Beauty.json` reach it as keyword arguments.
if __name__ == '__main__':
    fire.Fire(main)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
#!/bin/bash
# Run getImageDataset.py once for every dataset file listed below.
# Paths are relative, so start this script from the directory that holds
# getImageDataset.py. Exits non-zero if any conversion command fails.

save_dir="./new_dataset"
py_dir="./getImageDataset.py"

# -p: do not complain when the directory already exists from an earlier run.
mkdir -p "$save_dir"

datasets=(
    "All_Beauty.json"
    "AMAZON_FASHION.json"
    "Appliances.json"
    "Arts_Crafts_and_Sewing.json"
    "Automotive.json"
    "Books.json"
    "CDs_and_Vinyl.json"
    "Cell_Phones_and_Accessories.json"
    "Clothing_Shoes_and_Jewelry.json"
    "Electronics.json"
    "Grocery_and_Gourmet_Food.json"
    "Home_and_Kitchen.json"
    "Digital_Music.json"
    "Gift_Cards.json"
    "Industrial_and_Scientific.json"
    "Kindle_Store.json"
    "Luxury_Beauty.json"
    "Magazine_Subscriptions.json"
    "Movies_and_TV.json"
    "Musical_Instruments_5.json"
    "Office_Products_5.json"
    "Patio_Lawn_and_Garden.json"
    "Pet_Supplies.json"
    "Prime_Pantry.json"
    "Software.json"
    "Sports_and_Outdoors.json"
    "Tools_and_Home_Improvement.json"
    "Toys_and_Games.json"
    "Video_Games_5.json"
)

# Names of datasets whose conversion command returned a non-zero status.
failed=()

for dataset in "${datasets[@]}"; do
    name=$(basename "$dataset")

    echo "正在转换: $name"

    # Remember failures instead of stopping, so the remaining datasets still run.
    if ! python "$py_dir" --dataset "$dataset"; then
        echo "转换失败: $name" >&2
        failed+=("$name")
    fi
done

# Do not claim success when at least one conversion command failed.
if (( ${#failed[@]} > 0 )); then
    echo "以下数据集转换失败: ${failed[*]}" >&2
    exit 1
fi

echo "所有任务已完成!文件保存在: $save_dir"

数据集

Amazon Fashion:(reviews (28807))
All Beauty:(reviews (8391))
Appliances:(reviews (9258))
Arts Crafts and Sewing:(reviews (86106))
Automotive:(reviews (220180))
Books:(reviews (184269))
CDs and Vinyl:(reviews (17698))
Cell Phones and Accessories:(reviews (182305))
Clothing Shoes and Jewelry:(reviews (743397))
Digital Music:(reviews (6591))
Electronics:(reviews (348723))
Gift Cards:(reviews (406))
Grocery and Gourmet Food:(reviews (70935))
Home and Kitchen:(reviews (631633))
Industrial and Scientific:(reviews (32710))
Kindle Store:(reviews (6189))
Luxury Beauty:(reviews (7418))
Magazine Subscriptions:(reviews (135))
Movies and TV:(reviews (18346))
Musical Instruments_5:(reviews (954))
Office Products:(reviews (11302))
Patio Lawn and Garden:(reviews (133354))
Pet Supplies:(reviews (236884))
Prime Pantry:(reviews (3568))
Software:(reviews (1508))
Sports and Outdoors:(reviews (285607))
Tools and Home Improvement:(reviews (225482))
Toys and Games:(reviews (201978))
Video Games_5:(reviews (3634))
image-20251208140814713 image-20251208140654355 image-20251208140828919 image-20251208140839742 image-20251208145550601 image-20251208144501726 image-20251208142734850 image-20251208142802609 image-20251208143125271 image-20251208135628780 image-20251208143232966 image-20251208135651710 image-20251208143405479 image-20251208143425965 image-20251208135713079 image-20251208135745533 image-20251208135757290 image-20251208135812616 image-20251208135828048 image-20251208135846475 image-20251208135906003 image-20251208135916984 image-20251208135930851 image-20251208135942040 image-20251208135951977 image-20251208140002410 image-20251208140012410 image-20251208140027978 image-20251208140041150