关于np.random.choice(50, num_peaks, replace=False)的使用场景总结-程序员充电站

测试：np.random.choice(50, num_peaks, replace=False)

import numpy as np

# Example 1: basic usage -- draw `num_peaks` distinct integers from 0..49.
num_peaks = 5
result = np.random.choice(50, num_peaks, replace=False)
print(f"示例1结果: {result}")
print(f"结果类型: {type(result)}, 形状: {result.shape}")
# With replace=False no value can repeat, so this comparison prints False.
print(f"是否有重复: {len(set(result)) != len(result)}")

# Example 2: call repeatedly to observe that each draw differs (no seed set).
print("\n示例2 - 多次运行:")
for i in range(3):
    print(f"第{i+1}次: {np.random.choice(50, 5, replace=False)}")

# Example 3: sampling without replacement vs. with replacement.
print("\n示例3 - 有放回 vs 无放回:")
print("无放回 (replace=False):", np.random.choice(10, 5, replace=False))
print("有放回 (replace=True):", np.random.choice(10, 5, replace=True))

示例1结果: [ 3 12 45 22 7]
结果类型: <class 'numpy.ndarray'>, 形状: (5,)
是否有重复: False

示例2 - 多次运行:
第1次: [28 49 13 2 34]
第2次: [17 40 8 32 44]
第3次: [ 9 21 38 0 27]

示例3 - 有放回 vs 无放回:
无放回 (replace=False): [7 1 9 5 3]
有放回 (replace=True): [8 1 8 4 8]  # 注意8出现了3次！

重要限制和错误

# Error case 1: requesting more samples than the population contains.
try:
    # Drawing 15 distinct values out of 10 is impossible without replacement.
    result = np.random.choice(10, 15, replace=False)
except ValueError as e:
    print(f"错误1: {e}")

# Error case 2: the probabilities passed as `p` must sum to 1. This holds
# whether or not replace=False is used; the values below only sum to 0.6.
try:
    result = np.random.choice(5, 3, replace=False, p=[0.1, 0.2, 0.1, 0.1, 0.1])
except ValueError as e:
    print(f"错误2: {e}")

场景1：随机分割数据集

def split_dataset(data_size, test_ratio=0.2):
    """Randomly split the indices ``0 .. data_size - 1`` into train and test sets.

    Args:
        data_size: Total number of samples.
        test_ratio: Fraction of samples assigned to the test set, in [0, 1].
            The test-set size is ``int(data_size * test_ratio)``, i.e. the
            product is truncated toward zero.

    Returns:
        A ``(train_indices, test_indices)`` tuple of integer arrays. The two
        arrays are disjoint and together contain every index exactly once.
        ``train_indices`` is in ascending order; ``test_indices`` is in the
        order the indices were drawn.

    Raises:
        ValueError: If ``test_ratio`` is outside [0, 1].
    """
    if not 0 <= test_ratio <= 1:
        raise ValueError(f"test_ratio must be within [0, 1], got {test_ratio!r}")

    all_indices = np.arange(data_size)
    test_size = int(data_size * test_ratio)

    # Sampling without replacement guarantees no index is picked twice.
    test_indices = np.random.choice(data_size, test_size, replace=False)

    # Training indices are whatever was not drawn for the test set. Both
    # inputs are duplicate-free, so setdiff1d can skip its de-duplication pass.
    train_indices = np.setdiff1d(all_indices, test_indices, assume_unique=True)
    return train_indices, test_indices


# Usage example
data_size = 1000
train_idx, test_idx = split_dataset(data_size, test_ratio=0.2)
print(f"训练集大小: {len(train_idx)}, 测试集大小: {len(test_idx)}")

场景2：峰值检测应用

class PeakGenerator:
    """Generate random, non-repeating peak positions in a fixed-length sequence."""

    def __init__(self, seq_length=1024):
        """Store the sequence length, i.e. the number of candidate positions."""
        self.seq_length = seq_length

    def generate_random_peaks(self):
        """Draw random peak positions and build the matching indicator array.

        Returns:
            A ``(peak_positions, target)`` tuple. ``peak_positions`` holds the
            distinct indices that were drawn; ``target`` is a float array of
            length ``seq_length`` set to 1.0 at those indices and 0.0 elsewhere.
        """
        # randint's upper bound is exclusive, so this asks for 3..9 peaks. The
        # count is capped at seq_length because sampling without replacement
        # cannot return more distinct positions than exist.
        num_peaks = min(np.random.randint(3, 10), self.seq_length)

        peak_positions = np.random.choice(self.seq_length, num_peaks, replace=False)

        target = np.zeros(self.seq_length)
        target[peak_positions] = 1.0
        return peak_positions, target

    def visualize_peaks(self):
        """Plot one freshly generated set of peaks with matplotlib."""
        # Imported here so the rest of the class works without matplotlib.
        import matplotlib.pyplot as plt

        peak_positions, target = self.generate_random_peaks()

        plt.figure(figsize=(12, 4))
        plt.plot(target, 'r-', label='峰值位置')
        plt.scatter(peak_positions, np.ones_like(peak_positions),
                    c='blue', s=100, zorder=5, label='峰值点')
        plt.title(f'随机生成的 {len(peak_positions)} 个峰值位置')
        plt.xlabel('位置索引')
        plt.ylabel('幅度')
        plt.legend()
        plt.grid(True, alpha=0.3)
        plt.show()


# Usage
generator = PeakGenerator(1024)
if __name__ == "__main__":
    # Only plot when run as a script, so importing the class opens no window.
    generator.visualize_peaks()

场景3：随机抽样调查模拟

class RandomSurvey:
    """Simulate a survey that samples respondents without replacement."""

    def __init__(self, population_size=10000):
        """Store the size of the population that samples are drawn from."""
        self.population_size = population_size

    def conduct_survey(self, sample_size=1000, seed=None):
        """Draw a random sample and attach simulated answers.

        Args:
            sample_size: Number of respondents to draw; they are drawn without
                replacement from ``range(population_size)``.
            seed: Optional seed. The same seed always produces the same survey.

        Returns:
            A dict with three arrays of length ``sample_size``:
            ``'indices'`` (the distinct respondent indices), ``'ages'``
            (random integers from 18 up to, but not including, 80) and
            ``'responses'`` (one of 'A', 'B', 'C' drawn with probabilities
            0.4, 0.35 and 0.25).
        """
        # A private RandomState produces the same numbers as seeding the global
        # generator with `seed`, but leaves the global state untouched for any
        # other code. Without a seed the global generator is used directly.
        rng = np.random if seed is None else np.random.RandomState(seed)

        # Draw order (indices, ages, responses) determines the seeded output.
        sampled_indices = rng.choice(self.population_size, sample_size, replace=False)
        return {
            'indices': sampled_indices,
            'ages': rng.randint(18, 80, sample_size),
            'responses': rng.choice(['A', 'B', 'C'], sample_size, p=[0.4, 0.35, 0.25]),
        }

    def analyze_survey(self, results):
        """Print the sample size, the mean age and the share of each response.

        Args:
            results: A dict as returned by ``conduct_survey``.
        """
        responses = results['responses']
        total = len(responses)

        print(f"抽样大小: {len(results['indices'])}")
        print(f"平均年龄: {results['ages'].mean():.1f}")

        # Tally how often each response option occurs.
        unique, counts = np.unique(responses, return_counts=True)
        for resp, count in zip(unique, counts):
            percentage = count / total * 100
            print(f"选项 {resp}: {count} 人 ({percentage:.1f}%)")


# Usage
survey = RandomSurvey(10000)
results = survey.conduct_survey(sample_size=500, seed=42)
survey.analyze_survey(results)

6. 概率分布控制

# Use the `p` argument to control how likely each item is to be picked.
def weighted_random_selection(
    items=('A', 'B', 'C', 'D', 'E'),
    probabilities=(0.1, 0.2, 0.3, 0.25, 0.15),
    size=10,
):
    """Draw items with replacement according to per-item probabilities.

    Prints the drawn items followed by how often each distinct item occurred.

    Args:
        items: Candidate items to draw from.
        probabilities: One probability per item; they must sum to 1.
        size: Number of draws.

    Returns:
        The array of drawn items, of length ``size``.
    """
    # Weighted sampling with replacement: the same item may appear repeatedly.
    selected = np.random.choice(items, size=size, replace=True, p=probabilities)
    print("加权随机选择结果:", selected)

    # Frequency of each item that was actually drawn.
    unique, counts = np.unique(selected, return_counts=True)
    print("\n频率统计:")
    for item, count in zip(unique, counts):
        print(f"{item}: {count}次")
    return selected


weighted_random_selection()

7. 替代方案对比

import random
import time


def compare_methods(n=100000, k=1000):
    """Time three ways of drawing ``k`` distinct integers from ``range(n)``.

    Each method is run once, so the printed figures are a rough single-shot
    comparison rather than a statistically robust benchmark.

    Args:
        n: Population size.
        k: Number of distinct values to draw.

    Returns:
        A dict mapping each method name to its elapsed time in seconds.
    """
    # perf_counter is used instead of time.time(): it is monotonic and has the
    # highest available resolution, which matters for sub-millisecond timings.

    # Method 1: np.random.choice without replacement
    start = time.perf_counter()
    _ = np.random.choice(n, k, replace=False)
    choice_time = time.perf_counter() - start

    # Method 2: permute the whole population, keep the first k entries
    start = time.perf_counter()
    _ = np.random.permutation(n)[:k]
    permutation_time = time.perf_counter() - start

    # Method 3: random.sample from the Python standard library
    start = time.perf_counter()
    _ = np.array(random.sample(range(n), k))
    sample_time = time.perf_counter() - start

    print(f"方法比较 (从{n}个中选择{k}个):")
    print(f"1. np.random.choice: {choice_time:.6f}秒")
    print(f"2. np.random.permutation: {permutation_time:.6f}秒")
    print(f"3. random.sample: {sample_time:.6f}秒")

    return {
        "np.random.choice": choice_time,
        "np.random.permutation": permutation_time,
        "random.sample": sample_time,
    }


compare_methods()

8. 高级应用：蒙特卡洛模拟

def monte_carlo_simulation(n_simulations=10000):
    """Simulate rolls of a fair six-sided die and compare with the 1/6 ideal.

    Prints the count and observed frequency of every face, the theoretical
    probability, and the largest absolute deviation from it.

    Args:
        n_simulations: Number of rolls to simulate; must be at least 1.

    Returns:
        A float array of length 6 holding the observed relative frequency of
        faces 1 through 6, in that order.

    Raises:
        ValueError: If ``n_simulations`` is less than 1.
    """
    if n_simulations < 1:
        raise ValueError(f"n_simulations must be at least 1, got {n_simulations!r}")

    dice_faces = [1, 2, 3, 4, 5, 6]

    # Each roll is independent, so sample with replacement.
    rolls = np.random.choice(dice_faces, n_simulations, replace=True)

    # Count every face, including ones that never came up. Tallying only the
    # observed values would drop a missing face and understate the deviation.
    # Index 0 of the bincount is unused because faces start at 1.
    counts = np.bincount(rolls, minlength=len(dice_faces) + 1)[1:]
    probabilities = counts / n_simulations

    print("蒙特卡洛模拟 - 掷骰子结果:")
    for face, count, probability in zip(dice_faces, counts, probabilities):
        print(f"面{face}: {count}次, 概率: {probability:.4f}")

    # Theoretical vs. simulated probability
    theoretical_prob = 1/6
    print(f"\n理论概率: {theoretical_prob:.4f}")
    print(f"最大偏差: {np.abs(probabilities - theoretical_prob).max():.4f}")
    return probabilities


monte_carlo_simulation()

9. 可视化理解

def visualize_random_choice():
    """Plot four panels illustrating how np.random.choice samples.

    Top row: ten repeated draws of 5 indices out of 20, without replacement
    (left) and with replacement (right). Bottom row: a histogram of 1000
    uniform draws with replacement (left), and a weighted draw compared with
    the probabilities it was drawn from (right).
    """
    import matplotlib.pyplot as plt

    # Parameters
    population = 20
    sample_size = 5

    # Unit-width bins centred on each integer index. With bins=population the
    # bins would span only min..max of the data (width 0.95 here), so they
    # would not line up with the indices and density=True would not equal the
    # relative frequency.
    index_bins = np.arange(population + 1) - 0.5

    fig, axes = plt.subplots(2, 2, figsize=(12, 8))

    # Panels 1 and 2: ten repeated draws, first without then with replacement.
    panels = (
        (axes[0, 0], False, '无放回抽样 (replace=False)'),
        (axes[0, 1], True, '有放回抽样 (replace=True)'),
    )
    for ax, with_replacement, title in panels:
        for trial in range(10):
            sample = np.random.choice(population, sample_size, replace=with_replacement)
            ax.scatter(sample, [trial] * sample_size, s=50, alpha=0.6)
        ax.set_title(title)
        ax.set_xlabel('元素索引')
        ax.set_ylabel('实验次数')
        ax.set_xlim(-1, population)
        ax.grid(True, alpha=0.3)

    # Panel 3: distribution of a large uniform sample drawn with replacement.
    ax3 = axes[1, 0]
    large_sample = np.random.choice(population, 1000, replace=True)
    ax3.hist(large_sample, bins=index_bins, edgecolor='black', alpha=0.7)
    ax3.set_title('有放回抽样的分布 (1000次抽样)')
    ax3.set_xlabel('元素索引')
    ax3.set_ylabel('出现次数')
    ax3.grid(True, alpha=0.3)

    # Panel 4: weighted draw; bars show the probabilities used, the histogram
    # shows the observed relative frequencies.
    ax4 = axes[1, 1]
    weighted_probs = np.random.dirichlet(np.ones(population))
    weighted_sample = np.random.choice(population, 1000, p=weighted_probs, replace=True)
    ax4.bar(range(population), weighted_probs, alpha=0.5, label='理论概率')
    ax4.hist(weighted_sample, bins=index_bins, density=True,
             edgecolor='black', alpha=0.5, label='实际频率')
    ax4.set_title('加权随机选择')
    ax4.set_xlabel('元素索引')
    ax4.set_ylabel('概率/频率')
    ax4.legend()
    ax4.grid(True, alpha=0.3)

    plt.tight_layout()
    plt.show()


if __name__ == "__main__":
    # Only plot when run as a script, so importing this code opens no window.
    visualize_random_choice()

10. 性能优化建议

def optimized_random_selection():
    """Demonstrate three ways of drawing distinct random indices reproducibly.

    Returns:
        The first 1000 entries of a shuffled ``np.arange(1000000)``. The
        generator is seeded with 42, so every call returns the same array.
    """
    # 1. Fixed seed for reproducibility. A private RandomState(42) yields the
    # same numbers as np.random.seed(42) would, without reseeding the global
    # generator that other code may be relying on.
    rng = np.random.RandomState(42)

    # 2. Draw a whole batch in one call instead of one value per loop pass.
    # The result is unused, but the draw is kept so that the random stream,
    # and therefore the returned array, stays the same.
    print("方法1 - 批量生成:")
    n = 10000
    batch_size = 1000
    batch_results = rng.choice(n, batch_size, replace=False)

    # 3. Sample from an explicit integer array rather than a range object.
    # (Passing the plain integer `n`, as above, samples from the same values.)
    print("\n方法2 - 使用整数数组:")
    arr = np.arange(n)
    array_selection = rng.choice(arr, 100, replace=False)

    # 4. Alternative: shuffle the array in place and keep a prefix. Note that
    # this permutes all 1,000,000 entries even though only 1000 are kept.
    print("\n方法3 - 使用shuffle:")
    large_array = np.arange(1000000)
    rng.shuffle(large_array)
    selected = large_array[:1000]  # keep the first 1000

    return selected


optimized_random_selection()

总结：

np.random.choice(50, num_peaks, replace=False)意味着：
从0-49中选择num_peaks个不重复的整数
每个数字被选中的概率相等
返回一个NumPy数组
关键限制：
当replace=False时，size必须≤a的长度
若提供概率数组p，其长度必须与a一致，且各项之和必须为1（无论replace取何值）