"""Exploratory analysis of an image-classification dataset.

Reads the train/val label files, prints and plots the class distribution,
plots the distribution of image sizes, and shows a few sample images per
class. Every plot is saved as a PNG in the current working directory and
then displayed.
"""
import os
import random
from collections import Counter

import cv2
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from PIL import Image

# Font configuration so that Chinese text renders correctly in figures.
plt.rcParams['font.sans-serif'] = ['SimHei']  # render Chinese labels
plt.rcParams['axes.unicode_minus'] = False  # render the minus sign correctly

# Dataset root directories.
TRAIN_SET_PATH = 'TrainSet'
TEST_SET_PATH = 'TestSetA'

# Label file paths.
train_label_path = os.path.join(TRAIN_SET_PATH, 'labels', 'train.txt')
val_label_path = os.path.join(TRAIN_SET_PATH, 'labels', 'val.txt')
trainval_label_path = os.path.join(TRAIN_SET_PATH, 'labels', 'trainval.txt')

# Image directories.
train_image_dir = os.path.join(TRAIN_SET_PATH, 'images', 'train')
test_image_dir = TEST_SET_PATH

# Display names for the known class ids; unknown ids fall back to
# f"类别{cls}" at the point of use.
CLASS_NAMES = {
    0: "类别0",
    1: "类别1",
    2: "类别2",
    3: "类别3",
    4: "类别4",
    5: "类别5",
}


def read_label_file(file_path):
    """Parse a label file into a ``{image_name: label}`` dict.

    Each non-blank line must hold an image name followed by an integer
    label, separated by whitespace. The label is taken from the last
    whitespace-separated field, so image names containing spaces are
    accepted. Blank lines are skipped.

    Args:
        file_path: Path of the label file to read.

    Returns:
        dict mapping image name (str) to label (int).

    Raises:
        ValueError: If a non-blank line has no label field or the label
            is not an integer.
        OSError: If the file cannot be opened.
    """
    labels = {}
    with open(file_path, 'r') as f:
        for line_no, raw_line in enumerate(f, start=1):
            entry = raw_line.strip()
            if not entry:
                # Tolerate blank lines (e.g. a trailing newline at EOF).
                continue
            parts = entry.rsplit(maxsplit=1)
            if len(parts) != 2:
                raise ValueError(
                    f"{file_path}:{line_no}: expected '<image> <label>', "
                    f"got {entry!r}"
                )
            img_name, label = parts
            labels[img_name] = int(label)
    return labels


# Load the train / val / trainval labels once at import time; main() reads
# train_labels and val_labels from module scope.
train_labels = read_label_file(train_label_path)
val_labels = read_label_file(val_label_path)
trainval_labels = read_label_file(trainval_label_path)


def count_classes(labels):
    """Return a Counter of how many images carry each label.

    Args:
        labels: Mapping of image name to label.
    """
    return Counter(labels.values())


def get_image_sizes(image_dir, sample_size=100):
    """Collect the ``(width, height)`` of images found in a directory.

    If the directory holds more than ``sample_size`` entries, a random
    sample of that many entries is inspected instead of all of them.
    Entries that cannot be opened as images are reported on stdout and
    skipped.

    Args:
        image_dir: Directory to list.
        sample_size: Maximum number of directory entries to inspect.

    Returns:
        list of ``(width, height)`` tuples, one per successfully opened
        image.
    """
    image_files = os.listdir(image_dir)
    if len(image_files) > sample_size:
        image_files = random.sample(image_files, sample_size)

    sizes = []
    for img_file in image_files:
        img_path = os.path.join(image_dir, img_file)
        try:
            # The context manager closes the underlying file handle; only
            # the header is needed to obtain the size.
            with Image.open(img_path) as img:
                sizes.append(img.size)
        except Exception as e:
            # Deliberately best-effort: one unreadable entry must not
            # abort the whole survey.
            print(f"Error processing {img_file}: {e}")
    return sizes


def plot_class_distribution(train_counts, val_counts, title="类别分布"):
    """Plot per-class sample counts for the train and val sets side by side.

    Classes are ordered by their combined (train + val) count, largest
    first. The figure is saved to ``类别分布.png`` and then shown.

    Args:
        train_counts: Mapping of class id to sample count (training set).
        val_counts: Mapping of class id to sample count (validation set).
        title: Figure title.
    """
    classes = set(train_counts) | set(val_counts)
    if not classes:
        print("没有类别数据,跳过类别分布图。")
        return

    # Combined (train + val) count per class, used only for ordering.
    total_counts = {
        cls: train_counts.get(cls, 0) + val_counts.get(cls, 0)
        for cls in classes
    }
    # Largest class first; ties are broken by class id so the order is
    # deterministic.
    ordered = sorted(classes, key=lambda cls: (-total_counts[cls], cls))

    df = pd.DataFrame(
        {
            '训练集': [train_counts.get(cls, 0) for cls in ordered],
            '验证集': [val_counts.get(cls, 0) for cls in ordered],
        },
        index=[CLASS_NAMES.get(cls, f"类别{cls}") for cls in ordered],
    )

    # Draw a grouped (not stacked) bar chart onto an explicit axes object.
    # Calling plt.figure() and then df.plot() without ax= would leave an
    # extra empty figure behind.
    fig, ax = plt.subplots(figsize=(12, 6))
    df.plot(kind='bar', stacked=False, ax=ax, rot=0)
    ax.set_title(title, fontsize=16)
    ax.set_xlabel('类别', fontsize=14)
    ax.set_ylabel('样本数量', fontsize=14)

    # Annotate every bar with its count.
    for container in ax.containers:
        ax.bar_label(container, fmt='%d')

    fig.tight_layout()
    fig.savefig('类别分布.png', dpi=300, bbox_inches='tight')
    plt.show()


def plot_image_size_distribution(sizes):
    """Scatter-plot image widths against heights.

    A few common resolutions are overlaid as reference markers. The figure
    is saved to ``图像尺寸分布.png`` and then shown. Nothing is plotted when
    ``sizes`` is empty.

    Args:
        sizes: Iterable of ``(width, height)`` tuples.
    """
    sizes = list(sizes)
    if not sizes:
        # zip(*[]) cannot be unpacked into two sequences.
        print("没有可用的图像尺寸数据,跳过图像尺寸分布图。")
        return

    widths, heights = zip(*sizes)

    plt.figure(figsize=(12, 6))
    plt.scatter(widths, heights, alpha=0.5)
    plt.title('图像尺寸分布', fontsize=16)
    plt.xlabel('宽度 (像素)', fontsize=14)
    plt.ylabel('高度 (像素)', fontsize=14)
    plt.grid(True, linestyle='--', alpha=0.7)

    # Reference markers for common resolutions.
    common_resolutions = [
        (640, 480),
        (800, 600),
        (1024, 768),
        (1280, 720),
        (1920, 1080),
    ]
    for w, h in common_resolutions:
        plt.plot(w, h, 'ro', markersize=10, alpha=0.3)
        plt.annotate(f'{w}x{h}', (w, h), xytext=(10, 10),
                     textcoords='offset points')

    plt.tight_layout()
    plt.savefig('图像尺寸分布.png', dpi=300, bbox_inches='tight')
    plt.show()


def show_sample_images(image_dir, labels, num_per_class=3):
    """Show a grid of sample images, one row per class.

    Up to ``num_per_class`` images are drawn at random for every class;
    classes with fewer images show all of them. Images that OpenCV cannot
    read are reported on stdout and their cell is left blank. The figure
    is saved to ``类别样本图像.png`` and then shown.

    Args:
        image_dir: Directory containing the images named in ``labels``.
        labels: Mapping of image name to class label.
        num_per_class: Number of grid columns / samples per class.
    """
    # Group image names by class.
    class_images = {}
    for img_name, label in labels.items():
        class_images.setdefault(label, []).append(img_name)

    if not class_images or num_per_class < 1:
        # plt.subplots cannot build a grid with zero rows or columns.
        print("没有可显示的样本图像。")
        return

    # Pick the samples for each class.
    samples = {}
    for label, images in class_images.items():
        if len(images) >= num_per_class:
            samples[label] = random.sample(images, num_per_class)
        else:
            samples[label] = images

    num_classes = len(samples)
    # squeeze=False always yields a 2-D array of axes, so axes[row, col]
    # is valid even for a single row and/or a single column.
    fig, axes = plt.subplots(
        num_classes,
        num_per_class,
        figsize=(num_per_class * 4, num_classes * 4),
        squeeze=False,
    )

    # Hide every axis frame up front so cells without an image stay blank.
    for ax in axes.flat:
        ax.axis('off')

    for row, (label, img_names) in enumerate(sorted(samples.items())):
        for col, img_name in enumerate(img_names):
            img_path = os.path.join(image_dir, img_name)
            img = cv2.imread(img_path)
            if img is None:
                # cv2.imread signals failure by returning None rather
                # than raising.
                print(f"Error processing {img_name}: cannot read image")
                continue

            ax = axes[row, col]
            # OpenCV loads BGR; matplotlib expects RGB.
            ax.imshow(cv2.cvtColor(img, cv2.COLOR_BGR2RGB))
            ax.set_title(f"{CLASS_NAMES.get(label, f'类别{label}')}")

    fig.tight_layout()
    fig.savefig('类别样本图像.png', dpi=300, bbox_inches='tight')
    plt.show()


def analyze_class_imbalance(train_counts, val_counts):
    """Draw a pie chart of each class's share of the combined dataset.

    Train and val counts are summed per class; wedges are ordered from the
    largest class to the smallest, and the largest and smallest classes
    are pulled out of the pie for emphasis. The figure is saved to
    ``类别比例.png`` and then shown.

    Args:
        train_counts: Mapping of class id to sample count (training set).
        val_counts: Mapping of class id to sample count (validation set).
    """
    # Combined train + val count per class.
    total_counts = {
        cls: train_counts.get(cls, 0) + val_counts.get(cls, 0)
        for cls in set(train_counts) | set(val_counts)
    }

    total_samples = sum(total_counts.values())
    if total_samples == 0:
        # Avoids a division by zero and an empty pie.
        print("没有样本,跳过类别比例图。")
        return

    class_ratios = {
        cls: count / total_samples for cls, count in total_counts.items()
    }

    # Largest class first; ties are broken by class id.
    sorted_classes = sorted(
        total_counts, key=lambda cls: (-total_counts[cls], cls)
    )
    labels = [
        f"类别{cls} ({class_ratios[cls]:.1%})" for cls in sorted_classes
    ]
    sizes = [total_counts[cls] for cls in sorted_classes]

    # One colour per wedge, cycling through the 10-colour tab10 palette.
    colors = plt.cm.tab10(np.arange(len(sizes)) % 10)

    # Emphasise the largest and smallest classes. The explode list must
    # follow the same order as `sizes`, otherwise the offsets land on the
    # wrong wedges.
    largest = max(total_counts, key=total_counts.get)
    smallest = min(total_counts, key=total_counts.get)
    explode = [
        0.1 if cls in (largest, smallest) else 0 for cls in sorted_classes
    ]

    plt.figure(figsize=(10, 10))
    plt.pie(sizes, explode=explode, labels=labels, colors=colors,
            autopct='%1.1f%%', shadow=True, startangle=140,
            textprops={'fontsize': 12})
    plt.axis('equal')  # keep the pie circular
    plt.title('数据集类别分布比例', fontsize=16)

    plt.tight_layout()
    plt.savefig('类别比例.png', dpi=300, bbox_inches='tight')
    plt.show()


def main():
    """Run the full dataset analysis and produce all figures."""
    print("开始分析数据集...")

    # Class distribution of each split.
    train_counts = count_classes(train_labels)
    val_counts = count_classes(val_labels)

    print("\n训练集类别分布(按样本数量从大到小排序):")
    for cls, count in sorted(train_counts.items(), key=lambda x: x[1],
                             reverse=True):
        print(f"类别 {cls}: {count} 样本 ({count/len(train_labels):.1%})")

    print("\n验证集类别分布(按样本数量从大到小排序):")
    for cls, count in sorted(val_counts.items(), key=lambda x: x[1],
                             reverse=True):
        print(f"类别 {cls}: {count} 样本 ({count/len(val_labels):.1%})")

    # Class distribution bar chart.
    plot_class_distribution(train_counts, val_counts)

    # Class imbalance pie chart.
    analyze_class_imbalance(train_counts, val_counts)

    # Image size distribution.
    print("\n分析图像尺寸分布...")
    image_sizes = get_image_sizes(train_image_dir)
    plot_image_size_distribution(image_sizes)

    # Sample images for every class.
    print("\n显示每个类别的样本图像...")
    show_sample_images(train_image_dir, train_labels)

    print("\n数据集分析完成!")


if __name__ == "__main__":
    main()