# Python: minimum number of random samples needed to cover every class
import numpy as np
import scipy.io as scio
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import datasets
from sklearn.datasets import fetch_mldata
from sklearn.datasets import fetch_20newsgroups_vectorized
def Cover_RandomSampling(y):
    """Count the random draws needed until every label in ``y`` has been seen.

    Items are drawn uniformly at random without replacement. Drawing stops
    as soon as each distinct label (as determined by ``np.unique``) has been
    observed at least once; at that point the distinct labels and how many
    times each was drawn are printed.

    Args:
        y: Sequence or array of labels. Assumed to be one-dimensional —
            TODO confirm against callers that pass other shapes.

    Returns:
        int: Number of items drawn when full label coverage was reached.
        Returns 0 (and prints nothing) when ``y`` is empty.

    Note:
        The draw order comes from a single ``np.random.permutation`` call
        instead of repeated ``np.random.choice`` calls on a shrinking list.
        The sampling distribution is the same, but the sequence produced
        under a fixed ``np.random.seed`` differs from the earlier version.
    """
    # Encode each item as the index of its label in the sorted unique labels,
    # so coverage can be tracked with a counter array instead of re-running
    # np.unique over everything drawn so far.
    labels, codes = np.unique(np.asarray(y), return_inverse=True)
    codes = np.ravel(codes)
    n = len(y)
    num_labels = len(labels)

    counts = np.zeros(num_labels, dtype=np.intp)
    covered = 0
    drawn = 0
    # One shuffled index order is equivalent to drawing without replacement
    # one item at a time, and avoids the O(n) list conversion and removal
    # that each individual draw used to cost.
    for idx in np.random.permutation(n):
        code = codes[idx]
        if counts[code] == 0:
            covered += 1
        counts[code] += 1
        drawn += 1
        if covered == num_labels:
            print(labels, counts)
            break
    return drawn
if __name__ == "__main__":
    # ------------------------------------------------------------------
    # Alternative data sources, kept commented out. Enable exactly one
    # group so that `y` holds the label vector used below.
    # ------------------------------------------------------------------
    # -- CSV files read as a whole table --
    # data = np.array(pd.read_csv(r'E:\dataset\clusterData\spiral_1.csv', header=None))
    # data = np.array(pd.read_csv(r'E:\dataset\clusterData\bolbs_hard.csv', header=None))
    # data = np.array(pd.read_csv(r'E:\dataset\clusterData\two_circles.csv', header=None))
    # data = np.array(pd.read_csv(r'E:\dataset\clusterData\aggregation.csv', header=None))
    #
    # -- generate two-moons data, plot it and write it to CSV --
    # data,y = datasets.make_moons(n_samples=1000,shuffle=True,noise=0.1,random_state=101)
    # plt.scatter(data[:,0],data[:,1],c=y,marker='o')
    # plt.show()
    # y = np.vstack(y)
    # X = np.hstack((data,y))
    # X = pd.DataFrame(X)
    # X.to_csv(r'E:\dataset\clusterData\two_moons.csv',header=None,index=None)
    #
    # -- CSV files whose last column is taken as the label --
    # data = np.array(pd.read_csv(r'E:\dataset\clusterData\sonar.csv', header=None))
    # data = np.array(pd.read_csv(r'E:\dataset\clusterData\proker_label.csv', header=None))
    # y = data[:, -1]
    #
    # -- MNIST via fetch_mldata --
    # mnist = fetch_mldata('MNIST original')
    # y = mnist['target']
    #
    # -- label file loaded directly --
    # y = np.loadtxt(r'E:\dataset\clusterData\proker_label.csv',delimiter=',')
    #
    # -- 20 newsgroups --
    # twenty = fetch_20newsgroups_vectorized(subset='all')
    # y = twenty.target
    #
    # -------------- COIL20 (.mat files) --------------
    # path = r'E:\dataset\clusterData\label.mat'
    # path1 = r'E:\dataset\clusterData\fea.mat '
    # dataA = scio.loadmat(path)
    # dataB = scio.loadmat(path1)
    # X = dataB['fea']
    # y = dataA['label']
    # y = np.hstack(y)
    #
    # -------------- flame --------------
    # data = np.array(pd.read_csv(r'E:\dataset\clusterData\flame.csv', header=None))
    # X = data[:, :-1]
    # y = data[:, -1]
    #
    # -------------- COIL20_PCA1 --------------
    # data = np.array(pd.read_csv(r'E:\dataset\clusterData\COIL20_PCA1.csv', header=None))
    # X = data[:, :-1]
    # y = data[:, -1]

    # Active dataset: every column but the last goes to X, the last to y.
    table = pd.read_csv(r'E:\dataset\clusterData\letterABC.csv', header=None)
    data = np.array(table)
    X = data[:, :-1]
    y = data[:, -1]

    # ---- data loading ends here ---- (leftover note: distPercent = 20)

    # Repeat the coverage experiment 1000 times, then report the mean and the
    # sample standard deviation (ddof=1) of the number of draws needed.
    draws_per_trial = [Cover_RandomSampling(y) for _ in range(1000)]
    print(np.mean(draws_per_trial))
    print(np.std(draws_per_trial, ddof=1))