|
- import os
- import numpy as np
- import matplotlib.pyplot as plt
- import cv2
- from collections import Counter
- import random
- from PIL import Image
- import pandas as pd
- import seaborn as sns
-
# Configure matplotlib so that the Chinese labels used throughout this script
# can be rendered. 'font.sans-serif' is a priority list: SimHei stays first
# (the original choice); the other CJK-capable families are fallbacks for
# machines where SimHei is not installed.
plt.rcParams['font.sans-serif'] = [
    'SimHei',
    'Microsoft YaHei',
    'Noto Sans CJK SC',
    'WenQuanYi Micro Hei',
    'Arial Unicode MS',
]
# Draw negative numbers with an ASCII hyphen instead of the Unicode minus
# sign, so the minus sign displays correctly with the fonts above.
plt.rcParams['axes.unicode_minus'] = False
-
# Dataset root directories
TRAIN_SET_PATH = 'TrainSet'
TEST_SET_PATH = 'TestSetA'

# Label files: the three split lists all live in <TRAIN_SET_PATH>/labels
_label_dir = os.path.join(TRAIN_SET_PATH, 'labels')
train_label_path = os.path.join(_label_dir, 'train.txt')
val_label_path = os.path.join(_label_dir, 'val.txt')
trainval_label_path = os.path.join(_label_dir, 'trainval.txt')

# Image directories: training images sit under images/train, while the test
# set root is used directly as the test image directory
test_image_dir = TEST_SET_PATH
train_image_dir = os.path.join(TRAIN_SET_PATH, 'images', 'train')
-
# Read a label file
def read_label_file(file_path, encoding='utf-8'):
    """Parse a label file into a mapping of image name to integer class id.

    Every non-blank line must have the form ``<image_name> <label>``. The
    label is the last whitespace-separated field, so image names that
    themselves contain spaces are accepted. Blank lines (for example a
    trailing empty line at the end of the file) are skipped.

    Args:
        file_path: Path of the label file to read.
        encoding: Text encoding used to decode the file.

    Returns:
        dict mapping image name (str) to label (int). If a name appears on
        several lines, the last one wins.

    Raises:
        ValueError: If a non-blank line has no separate label field, or the
            label field is not an integer.
        OSError: If the file cannot be opened.
    """
    labels = {}
    with open(file_path, 'r', encoding=encoding) as f:
        for line_no, raw_line in enumerate(f, start=1):
            line = raw_line.strip()
            if not line:
                continue
            # Split once from the right: everything before the final field
            # is the image name.
            fields = line.rsplit(None, 1)
            if len(fields) != 2:
                raise ValueError(
                    f"{file_path}:{line_no}: expected '<image_name> <label>', "
                    f"got {line!r}"
                )
            img_name, label = fields
            labels[img_name] = int(label)
    return labels
-
# Load the train, val and trainval label mappings (in that order) when the
# module is loaded; main() reads train_labels and val_labels as globals.
train_labels, val_labels, trainval_labels = map(
    read_label_file,
    (train_label_path, val_label_path, trainval_label_path),
)
-
# Count how many samples each class has
def count_classes(labels):
    """Return a Counter mapping each class id to its number of samples.

    ``labels`` is a mapping whose values are class ids; its keys are not
    looked at.
    """
    per_class = Counter()
    per_class.update(labels.values())
    return per_class
-
# Collect image sizes from a directory
def get_image_sizes(image_dir, sample_size=100):
    """Return the sizes of (a random sample of) the files in ``image_dir``.

    Args:
        image_dir: Directory whose entries are opened as images.
        sample_size: Maximum number of entries to inspect. When the
            directory holds more entries, ``sample_size`` of them are chosen
            with ``random.sample``. ``None`` inspects every entry.

    Returns:
        list of ``img.size`` values, one per entry that could be opened.
        Entries that fail to open are reported on stdout and skipped.
    """
    image_files = os.listdir(image_dir)
    if sample_size is not None and len(image_files) > sample_size:
        image_files = random.sample(image_files, sample_size)

    sizes = []
    for img_file in image_files:
        img_path = os.path.join(image_dir, img_file)
        # Best effort: anything that is not a readable image (sub-directory,
        # stray text file, corrupt file) is reported and skipped.
        try:
            # The with-block closes the underlying file as soon as the size
            # has been read, instead of leaving one open handle per image.
            with Image.open(img_path) as img:
                sizes.append(img.size)
        except Exception as e:
            print(f"Error processing {img_file}: {e}")

    return sizes
-
# Visualize the class distribution
def plot_class_distribution(train_counts, val_counts, title="类别分布"):
    """Draw a grouped bar chart of per-class sample counts and save it.

    Classes are ordered by their combined (train + val) sample count, largest
    first. The chart is written to '类别分布.png' and then shown.

    Args:
        train_counts: Mapping of class id to sample count in the training split.
        val_counts: Mapping of class id to sample count in the validation split.
        title: Chart title.

    Returns:
        None. When both mappings are empty nothing is drawn.
    """
    classes = list(set(list(train_counts.keys()) + list(val_counts.keys())))
    if not classes:
        # An empty frame has no numeric data to plot.
        print("没有类别数据,跳过类别分布图。")
        return

    # Combined sample count (train + val) per class, used only for ordering
    total_counts = {
        cls: train_counts.get(cls, 0) + val_counts.get(cls, 0) for cls in classes
    }
    classes = sorted(classes, key=total_counts.get, reverse=True)

    df = pd.DataFrame(
        {
            '训练集': [train_counts.get(cls, 0) for cls in classes],
            '验证集': [val_counts.get(cls, 0) for cls in classes],
        },
        index=[f"类别{cls}" for cls in classes],
    )

    # Create the figure once and hand its axes to pandas; calling df.plot()
    # without ``ax`` would open a second figure and leave an empty one behind.
    fig, ax = plt.subplots(figsize=(12, 6))
    # Grouped (side-by-side) bars: one train bar and one val bar per class
    df.plot(kind='bar', stacked=False, ax=ax)
    ax.set_title(title, fontsize=16)
    ax.set_xlabel('类别', fontsize=14)
    ax.set_ylabel('样本数量', fontsize=14)
    plt.xticks(rotation=0)

    # Write the sample count above every bar
    for container in ax.containers:
        ax.bar_label(container, fmt='%d')

    fig.tight_layout()
    fig.savefig('类别分布.png', dpi=300, bbox_inches='tight')
    plt.show()
-
# Visualize the image size distribution
def plot_image_size_distribution(sizes):
    """Scatter-plot image widths against heights and save the figure.

    A few common resolutions are drawn as faint red reference markers. The
    chart is written to '图像尺寸分布.png' and then shown.

    Args:
        sizes: Iterable of ``(width, height)`` pairs.

    Returns:
        None. When ``sizes`` is empty nothing is drawn.
    """
    sizes = list(sizes)
    if not sizes:
        # zip(*[]) yields nothing, so the width/height unpacking below would
        # raise ValueError; this happens when no image could be opened.
        print("没有可用的图像尺寸数据,跳过图像尺寸分布图。")
        return

    # Split the pairs into a width sequence and a height sequence
    widths, heights = zip(*sizes)

    plt.figure(figsize=(12, 6))

    plt.scatter(widths, heights, alpha=0.5)
    plt.title('图像尺寸分布', fontsize=16)
    plt.xlabel('宽度 (像素)', fontsize=14)
    plt.ylabel('高度 (像素)', fontsize=14)
    plt.grid(True, linestyle='--', alpha=0.7)

    # Mark and annotate common resolutions for orientation
    common_resolutions = [(640, 480), (800, 600), (1024, 768), (1280, 720), (1920, 1080)]
    for w, h in common_resolutions:
        plt.plot(w, h, 'ro', markersize=10, alpha=0.3)
        plt.annotate(f'{w}x{h}', (w, h), xytext=(10, 10), textcoords='offset points')

    plt.tight_layout()
    plt.savefig('图像尺寸分布.png', dpi=300, bbox_inches='tight')
    plt.show()
-
# Show sample images for every class
def show_sample_images(image_dir, labels, num_per_class=3):
    """Draw a grid with one row per class and up to ``num_per_class`` images.

    The grid is written to '类别样本图像.png' and then shown.

    Args:
        image_dir: Directory that contains the image files named in ``labels``.
        labels: Mapping of image file name to class id.
        num_per_class: Number of columns, i.e. images shown per class. Classes
            with fewer images show all of them and leave the rest of the row
            blank.

    Returns:
        None. Nothing is drawn when ``labels`` is empty or ``num_per_class``
        is smaller than 1.
    """
    # Group image names by class
    class_images = {}
    for img_name, label in labels.items():
        class_images.setdefault(label, []).append(img_name)

    if not class_images or num_per_class < 1:
        # plt.subplots() rejects a grid with zero rows or zero columns.
        print("没有可显示的样本图像。")
        return

    # Pick the images to show for each class
    samples = {}
    for label, images in class_images.items():
        if len(images) >= num_per_class:
            samples[label] = random.sample(images, num_per_class)
        else:
            samples[label] = images

    num_classes = len(samples)
    # squeeze=False always yields a 2-D axes array, so axes[row, col] is valid
    # even for a single class or a single column.
    fig, axes = plt.subplots(
        num_classes,
        num_per_class,
        figsize=(num_per_class * 4, num_classes * 4),
        squeeze=False,
    )

    # Hide the frame of every cell up front so unused cells stay blank
    for ax in axes.flat:
        ax.axis('off')

    for row, label in enumerate(sorted(samples)):
        for col, img_name in enumerate(samples[label]):
            img_path = os.path.join(image_dir, img_name)
            # cv2.imread signals failure by returning None, not by raising
            img = cv2.imread(img_path)
            if img is None:
                print(f"无法读取图像: {img_path}")
                continue

            ax = axes[row, col]
            # OpenCV loads BGR; matplotlib expects RGB
            ax.imshow(cv2.cvtColor(img, cv2.COLOR_BGR2RGB))
            ax.set_title(f"类别{label}")

    plt.tight_layout()
    plt.savefig('类别样本图像.png', dpi=300, bbox_inches='tight')
    plt.show()
-
# Analyze class imbalance
def analyze_class_imbalance(train_counts, val_counts):
    """Draw a pie chart of each class's share of all (train + val) samples.

    Slices are ordered by sample count, largest first; the largest and the
    smallest class are pulled out of the pie. The chart is written to
    '类别比例.png' and then shown.

    Args:
        train_counts: Mapping of class id to sample count in the training split.
        val_counts: Mapping of class id to sample count in the validation split.

    Returns:
        None. When there are no samples nothing is drawn.
    """
    # Combine the train and val counts per class
    total_counts = {}
    for cls in set(list(train_counts.keys()) + list(val_counts.keys())):
        total_counts[cls] = train_counts.get(cls, 0) + val_counts.get(cls, 0)

    total_samples = sum(total_counts.values())
    if total_samples == 0:
        # No classes, or only zero counts: the ratios would be undefined.
        print("没有样本数据,跳过类别比例饼图。")
        return

    # Every per-slice list below is built from this one ordering so that
    # labels, sizes and explode offsets all refer to the same wedge.
    sorted_classes = sorted(total_counts, key=total_counts.get, reverse=True)
    labels = [
        f"类别{cls} ({total_counts[cls] / total_samples:.1%})"
        for cls in sorted_classes
    ]
    sizes = [total_counts[cls] for cls in sorted_classes]

    # Cycle through the 10 colours of the tab10 colormap
    colors = plt.cm.tab10(np.arange(len(sizes)) % 10)

    # Pull out the largest and the smallest class; both are looked up once
    # instead of once per slice.
    largest = max(total_counts, key=total_counts.get)
    smallest = min(total_counts, key=total_counts.get)
    explode = [0.1 if cls in (largest, smallest) else 0 for cls in sorted_classes]

    plt.figure(figsize=(10, 10))
    plt.pie(sizes, explode=explode, labels=labels, colors=colors, autopct='%1.1f%%',
            shadow=True, startangle=140, textprops={'fontsize': 12})
    plt.axis('equal')  # keep the pie circular
    plt.title('数据集类别分布比例', fontsize=16)
    plt.tight_layout()
    plt.savefig('类别比例.png', dpi=300, bbox_inches='tight')
    plt.show()
-
# Main entry point
def main():
    """Run the whole analysis: print per-split class statistics, then draw
    the class distribution, class ratio, image size and sample image charts.

    Reads the module-level ``train_labels``, ``val_labels`` and
    ``train_image_dir``.
    """
    print("开始分析数据集...")

    # Per-class sample counts of both splits
    train_counts = count_classes(train_labels)
    val_counts = count_classes(val_labels)

    # Both splits are reported the same way: a heading, then one line per
    # class, classes with the most samples first.
    reports = (
        ("\n训练集类别分布(按样本数量从大到小排序):", train_counts, train_labels),
        ("\n验证集类别分布(按样本数量从大到小排序):", val_counts, val_labels),
    )
    for heading, counts, split_labels in reports:
        print(heading)
        ranked = sorted(counts.items(), key=lambda item: item[1], reverse=True)
        for cls, count in ranked:
            print(f"类别 {cls}: {count} 样本 ({count/len(split_labels):.1%})")

    # Grouped bar chart of the class distribution
    plot_class_distribution(train_counts, val_counts)

    # Pie chart of the class ratios
    analyze_class_imbalance(train_counts, val_counts)

    # Sizes of a sample of training images, then their scatter plot
    print("\n分析图像尺寸分布...")
    plot_image_size_distribution(get_image_sizes(train_image_dir))

    # Grid of sample images per class
    print("\n显示每个类别的样本图像...")
    show_sample_images(train_image_dir, train_labels)

    print("\n数据集分析完成!")


# Run the analysis only when this file is executed as a script
if __name__ == "__main__":
    main()
|