testing
hsmm/real_data.py (new file, 91 lines)
@@ -0,0 +1,91 @@
import torch
from torch.utils.data import Dataset
import numpy as np
import os
from sklearn.decomposition import PCA


class RealAudioDataset(Dataset):
    def __init__(self, npy_path, len_path=None, crop_len=None, pca_dim=None, pca_model=None):
        """
        npy_path: Path to the huge .npy file
        len_path: Path to the .lengths file (optional, tries to infer if None)
        crop_len: If set (e.g., 200), we randomly crop sequences to this length for training.
        pca_dim: If set (e.g., 30), we learn/apply PCA reduction.
        """
        # 1. Load Data (Memory Mapped to save RAM)
        if not os.path.exists(npy_path):
            raise FileNotFoundError(f"Could not find {npy_path}")

        self.data = np.load(npy_path, mmap_mode='r')
        self.input_dim = self.data.shape[1]

        # 2. Load Lengths
        if len_path is None:
            # Assume .lengths is next to .npy
            len_path = npy_path.replace('.npy', '.lengths')

        if not os.path.exists(len_path):
            raise FileNotFoundError(f"Could not find length file: {len_path}")

        with open(len_path, 'r') as f:
            self.lengths = [int(x) for x in f.read().strip().split()]

        # Create Offsets (Where each sentence starts in the flat file)
        self.offsets = np.cumsum([0] + self.lengths[:-1])
        self.n_samples = len(self.lengths)
        self.crop_len = crop_len

        print(f"Loaded Dataset: {self.n_samples} files. Dim: {self.input_dim}")

        # 3. Handle PCA
        self.pca = pca_model
        if pca_dim is not None and self.input_dim > pca_dim:
            if self.pca is None:
                print(f"Fitting PCA to reduce dim from {self.input_dim} -> {pca_dim}...")
                # Fit on a subset (first 100k frames) to be fast
                subset_size = min(len(self.data), 100000)
                subset = self.data[:subset_size]
                self.pca = PCA(n_components=pca_dim)
                self.pca.fit(subset)
                print("PCA Fit Complete.")
            else:
                print("Using provided PCA model.")

    def __len__(self):
        return self.n_samples

    def __getitem__(self, idx):
        # 1. Locate the sentence
        start = self.offsets[idx]
        length = self.lengths[idx]

        # 2. Extract Data
        # If training (crop_len set), pick a random window
        if self.crop_len and length > self.crop_len:
            # Random offset within the sentence
            max_start = length - self.crop_len
            offset = np.random.randint(0, max_start + 1)

            # Slice the mmap array
            raw_seq = self.data[start+offset : start+offset+self.crop_len]
        else:
            # Validation/Inference (Return full sequence)
            # Note: Batch size must be 1 for variable lengths!
            raw_seq = self.data[start : start+length]

        # 3. Apply PCA (On the fly)
        if self.pca is not None:
            raw_seq = self.pca.transform(raw_seq)

        # 4. Convert to Tensor
        return torch.tensor(raw_seq, dtype=torch.float32)


def get_real_dataloaders(npy_path, batch_size, crop_len=200, pca_dim=30):
    # 1. Training Set (Random Crops)
    train_ds = RealAudioDataset(npy_path, crop_len=crop_len, pca_dim=pca_dim)

    # 2. Validation Set (Full Sequences, Shared PCA)
    # We use batch_size=1 because lengths vary!
    val_ds = RealAudioDataset(npy_path, crop_len=None, pca_dim=pca_dim, pca_model=train_ds.pca)

    return train_ds, val_ds
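For reference, RealAudioDataset expects the .npy file to hold one flat (total_frames, feature_dim) matrix and the .lengths file to hold whitespace-separated per-utterance frame counts, as the constructor above implies. A minimal sketch that writes a compatible dummy pair; file names, the 39-dim features, and the utterance lengths are illustrative, not taken from the commit:

import numpy as np

# Three hypothetical utterances stored back-to-back as one flat (total_frames, dim) array.
lengths = [250, 300, 220]
feats = np.random.randn(sum(lengths), 39).astype(np.float32)

np.save("dummy_feats.npy", feats)
# Sidecar file the dataset infers by replacing .npy with .lengths
with open("dummy_feats.lengths", "w") as f:
    f.write(" ".join(str(n) for n in lengths))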
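Note that get_real_dataloaders returns the two Dataset objects and does not itself build DataLoaders (its batch_size argument is unused inside the function). A minimal wrapping sketch, reusing the dummy files above and assuming every training utterance is at least crop_len frames so the fixed-size crops can be stacked; validation keeps batch_size=1 because full sequences vary in length, as the comments in the code note:

from torch.utils.data import DataLoader

train_ds, val_ds = get_real_dataloaders("dummy_feats.npy", batch_size=2,
                                         crop_len=200, pca_dim=30)

# Fixed-length random crops -> default collation can stack them.
train_loader = DataLoader(train_ds, batch_size=2, shuffle=True)
# Full variable-length sequences -> one utterance per batch.
val_loader = DataLoader(val_ds, batch_size=1, shuffle=False)

for batch in train_loader:
    print(batch.shape)  # torch.Size([2, 200, 30]) with the dummy data above
    break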