import os
import shutil
import random
from pathlib import Path

# Configuration (module-level defaults; main() accepts overrides so the
# script can be reused or tested without editing these).
source_dir = "litter_detection_augmented"
output_base = "litter_detection_split"
train_ratio = 0.8
validate_ratio = 0.1
test_ratio = 0.1

split_names = ["train", "validate", "test"]
class_names = ["positive", "negative"]

# Extensions treated as images; str.endswith accepts a tuple directly.
IMAGE_EXTENSIONS = ('.jpg', '.jpeg', '.png', '.bmp', '.gif')


def partition(items, train_frac, validate_frac):
    """Split an (already shuffled) list into (train, validate, test) lists.

    Counts are floored with int(), so the test split absorbs any rounding
    remainder — this mirrors the original inline arithmetic exactly.

    Args:
        items: sequence of filenames (or any items) to partition.
        train_frac: fraction assigned to the training split.
        validate_frac: fraction assigned to the validation split.

    Returns:
        Tuple of three lists (train, validate, test) covering all items.
    """
    total = len(items)
    n_train = int(total * train_frac)
    n_validate = int(total * validate_frac)
    return (
        items[:n_train],
        items[n_train:n_train + n_validate],
        items[n_train + n_validate:],
    )


def main(source_dir=source_dir, output_base=output_base,
         train_ratio=train_ratio, validate_ratio=validate_ratio,
         test_ratio=test_ratio, seed=42):
    """Split per-class image folders into train/validate/test directories.

    For each class directory under *source_dir*, shuffles its image files
    and copies them into output_base/<split>/<class>. The shuffle uses a
    seeded private RNG so reruns produce the identical split; pass
    seed=None to restore the previous nondeterministic behavior.

    NOTE(review): output directories are not cleared first — rerunning
    with different ratios or a different seed can leave stale files from
    a previous split mixed in. Confirm whether a clean step is wanted.
    """
    rng = random.Random(seed)

    # Create output directory tree up front.
    for split in split_names:
        for cls in class_names:
            os.makedirs(os.path.join(output_base, split, cls), exist_ok=True)

    # Process each class independently so the ratios hold per class.
    for cls in class_names:
        class_dir = os.path.join(source_dir, cls)
        if not os.path.exists(class_dir):
            print(f"Warning: {class_dir} not found, skipping...")
            continue

        # Single pass: regular files with a recognized image extension.
        images = [
            f for f in os.listdir(class_dir)
            if os.path.isfile(os.path.join(class_dir, f))
            and f.lower().endswith(IMAGE_EXTENSIONS)
        ]

        rng.shuffle(images)
        train_images, validate_images, test_images = partition(
            images, train_ratio, validate_ratio)

        total = len(images)
        print(f"\n{cls.upper()} images: {total}")
        print(f"  Train: {len(train_images)} ({train_ratio*100:.0f}%)")
        print(f"  Validate: {len(validate_images)} ({validate_ratio*100:.0f}%)")
        print(f"  Test: {len(test_images)} ({test_ratio*100:.0f}%)")

        # Copy files into their split directories.
        splits = [
            ("train", train_images),
            ("validate", validate_images),
            ("test", test_images),
        ]
        for split_name, split_images in splits:
            for img in split_images:
                src = os.path.join(class_dir, img)
                dst = os.path.join(output_base, split_name, cls, img)
                shutil.copy2(src, dst)  # copy2 preserves file metadata

    # Summary: count whatever landed in each split directory.
    print(f"\nDone! Dataset split into {output_base}")
    print(f"Structure:")
    for split in split_names:
        print(f"  {split}/")
        for cls in class_names:
            count = len(os.listdir(os.path.join(output_base, split, cls)))
            print(f"    {cls}: {count} images")


if __name__ == "__main__":
    main()