entry.py
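A brief module docstring, sketched from the code that follows (the dataset name and file names are the ones the script itself opens):

"""Naive Bayes classifier for the UCI Adult (census income) data.

Builds per-class value counts from adult.data and reports prediction accuracy
on adult.test.
"""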
from numpy import median
import numpy as np
a = ["age", "type_employer", "fnlwgt", "education", "education_num", "marital", "occupation", "relationship", "race",
"sex", "capital_gain", "capital_loss", "hr_per_week", "country", "income"]
# The data is simplified later on, so some field values need to be remapped to coarser categories
tiny_map = {"Never-worked": "not-working", "Without-pay": "not-working", "Local-gov": "other-govt",
"State-gov": "other-govt", "Self-emp-inc": "self-employed", "Self-emp-not-inc": "self-employed",
"Craft-repair": "blue-collar", "Farming-fishing": "blue-collar", "Handlers-cleaners": "blue-collar",
"Machine-op-inspct": "blue-collar", "Transport-moving": "blue-collar", "Other-service": "service",
"Priv-house-serv": "service", "Cambodia": "SE-Asia", "Laos": "SE-Asia", "Philippines": "SE-Asia",
"Thailand": "SE-Asia", "Vietnam": "SE-Asia", "Canada": "British-Commonwealth",
"England": "British-Commonwealth", "India": "British-Commonwealth", "Ireland": "British-Commonwealth",
"Scotland": "British-Commonwealth", "China": "China", "Hong": "China", "Taiwan": "China",
"Columbia": "South-America", "Ecuador": "South-America", "El-Salvador": "South-America",
"Peru": "South-America", "Cuba": "other", "Iran": "other", "Japan": "other",
"Dominican-Republic": "Latin-America", "Guatemala": "Latin-America", "Haiti": "Latin-America",
"Honduras": "Latin-America", "Jamaica": "Latin-America", "Mexico": "Latin-America",
"Nicaragua": "Latin-America", "Outlying-US(Guam-USVI-etc)": "Latin-America", "Puerto-Rico": "Latin-America",
"Trinadad&Tobago": "Latin-America", "France": "Euro_1", "Germany": "Euro_1", "Holand-Netherlands": "Euro_1",
"Italy": "Euro_1", "Greece": "Euro_2", "Hungary": "Euro_2", "Poland": "Euro_2", "Portugal": "Euro_2",
"Yugoslavia": "Euro_2", "10th": "dropout", "11th": "dropout", "12th": "dropout", "1st-4th": "dropout",
"5th-6th": "dropout", "7th-8th": "dropout", "9th": "dropout", "Preschool": "dropout", "Assoc-acdm": "Assoc",
"Assoc-voc": "Assoc", "Married-AF-spouse": "Married", "Married-civ-spouse": "Married",
"Married-spouse-absent": "not-married", "Separated": "not-married", "Divorced": "not-married", }
# Working hours 1-100 are bucketed in tens: 1-10 -> "10s", 11-20 -> "20s", ..., 91-100 -> "100s"
hour_map = {str(h): "%ds" % (((h - 1) // 10 + 1) * 10) for h in range(1, 101)}
# Ages 1-100 are bucketed in fives: 1-5 -> "5s", 6-10 -> "10s", ..., 96-100 -> "100s"
age_map = {str(g): "%ds" % (((g - 1) // 5 + 1) * 5) for g in range(1, 101)}
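# Illustrative sanity checks on the remapping tables (values taken from the maps above)
assert tiny_map["Vietnam"] == "SE-Asia" and tiny_map["11th"] == "dropout"
assert hour_map["40"] == "40s" and hour_map["41"] == "50s"
assert age_map["37"] == "40s" and age_map["86"] == "90s"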
class DataSet:
    def __init__(self):
        self.data = []
        self.loss_mid = 0
        self.gain_mid = 0
        self.hours_mid = 0
        self.age_mid = 0
        self.classfied_dataset = None
        self.len_data = 0
dataset_low = DataSet()
dataset_high = DataSet()
with open('adult.data', 'r') as fin:
    line = fin.readline()
    gain_high = []
    loss_high = []
    gain_low = []
    loss_low = []
    hours_low = []
    hours_high = []
    age_low = []
    age_high = []
    while line:
        line = line.replace('\n', '')
        if line:
            line = line.split(", ")
            if line[len(line) - 1] == '>50K':
                dataset_high.data.append(line)
                gain_high.append(int(line[10]))
                loss_high.append(int(line[11]))
                age_high.append(int(line[0]))
                hours_high.append(int(line[12]))
            else:
                dataset_low.data.append(line)
                gain_low.append(int(line[10]))
                loss_low.append(int(line[11]))
                age_low.append(int(line[0]))
                hours_low.append(int(line[12]))
        line = fin.readline()
# Compute the medians of the numeric columns for each class
dataset_low.loss_mid = median(np.array(loss_low))
dataset_low.gain_mid = median(np.array(gain_low))
dataset_low.age_mid = median(np.array(age_low))
dataset_low.hours_mid = median(np.array(hours_low))
dataset_low.len_data = len(dataset_low.data)
dataset_high.loss_mid = median(np.array(loss_high))
dataset_high.gain_mid = median(np.array(gain_high))
dataset_high.age_mid = median(np.array(age_high))
dataset_high.hours_mid = median(np.array(hours_high))
dataset_high.len_data = len(dataset_high.data)
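# Note: only the gain/loss medians are used later (to bucket capital_gain and
# capital_loss); age_mid and hours_mid are computed here but never read again.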
"""分类以后的处理
classfiled_data['age'] = {type: num;}
classfiled_data = {'age':{type: num...}}
"""
# @todo: 年龄还需要分类,还有工作时长
# hour也按照中位数来处理吧。还有年龄也是。
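# For illustration, after classfy() runs the structure looks roughly like
#     {'sex': {'Male': <count>, 'Female': <count>},
#      'marital': {'Married': <count>, 'not-married': <count>, ...},
#      ...}
# where the counts depend on which class subset of adult.data is passed in.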
def classfy(gain, loss, data):
    classfiled_data = {}
    loss_median = loss
    gain_median = gain
    for node in a:
        classfiled_data[node] = {}
    for line in data:
        if len(line) < 10:
            continue
        for node in a:
            if line[a.index(node)] in classfiled_data[node]:
                classfiled_data[node][line[a.index(node)]] += 1
            else:
                classfiled_data[node][line[a.index(node)]] = 1
    def tiny(a_list, category, new_name):
        if new_name not in classfiled_data[category]:
            classfiled_data[category][new_name] = 0
        for key in list(classfiled_data[category]):
            if key in a_list and key != new_name:
                classfiled_data[category][new_name] += classfiled_data[category][key]
                del classfiled_data[category][key]
    def income_classfy(category, mid_value):
        """Bucket capital gain/loss into three levels: none, low, high."""
        classfiled_data[category]['low'] = 0
        classfiled_data[category]['none'] = 0
        classfiled_data[category]['high'] = 0
        for key in list(classfiled_data[category]):
            if key in ['low', 'none', 'high']:
                continue
            if int(key) <= 0:
                classfiled_data[category]['none'] += classfiled_data[category][key]
            elif int(key) < mid_value:
                classfiled_data[category]['low'] += classfiled_data[category][key]
            else:
                classfiled_data[category]['high'] += classfiled_data[category][key]
            del classfiled_data[category][key]
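    # For illustration (hypothetical counts): income_classfy('capital_gain', 4000)
    # would turn {'0': 40, '3000': 3, '5000': 1} into {'none': 40, 'low': 3, 'high': 1}.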
    # 1. Drop the two redundant columns
    del classfiled_data['education_num']
    del classfiled_data['fnlwgt']
    tiny(['Never-worked', 'Without-pay'], 'type_employer', 'not-working')
    tiny(['Local-gov', 'State-gov'], 'type_employer', 'other-govt')
    tiny(['Self-emp-inc', 'Self-emp-not-inc'], 'type_employer', 'self-employed')
    # Merge occupations
    tiny(["Craft-repair", "Farming-fishing", "Handlers-cleaners", "Machine-op-inspct", "Transport-moving"],
         'occupation',
         'blue-collar')
    tiny(['Other-service', 'Priv-house-serv'], 'occupation', 'service')
    tiny(["Cambodia", "Laos", "Philippines", "Thailand", "Vietnam"], 'country', 'SE-Asia')
    tiny(["Canada", "England", "India", "Ireland", "Scotland"], 'country', 'British-Commonwealth')
    tiny(['China', 'Hong', 'Taiwan'], 'country', 'China')
    tiny(["Columbia", "Ecuador", "El-Salvador", "Peru"], 'country', 'South-America')
    tiny(["Cuba", "Iran", "Japan"], 'country', 'other')
    tiny(["Dominican-Republic", "Guatemala", "Haiti", "Honduras", "Jamaica", "Mexico", "Nicaragua",
          "Outlying-US(Guam-USVI-etc)", "Puerto-Rico", "Trinadad&Tobago"], 'country', 'Latin-America')
    tiny(["France", "Germany", "Holand-Netherlands", "Italy"], 'country', 'Euro_1')
    tiny(["Greece", "Hungary", "Poland", "Portugal", "Yugoslavia"], 'country', 'Euro_2')
    tiny(["10th", "11th", "12th", "1st-4th", "5th-6th", "7th-8th", "9th", "Preschool"], 'education', 'dropout')
    tiny(['Assoc-acdm', 'Assoc-voc'], 'education', 'Assoc')
    tiny(["Married-AF-spouse", "Married-civ-spouse"], 'marital', "Married")
    tiny(["Married-spouse-absent", "Separated", "Divorced"], 'marital', 'not-married')
    # A drawback: after splitting at the median, the data piles up at the two ends,
    # with nothing left in the middle.
    income_classfy('capital_gain', gain_median)
    income_classfy('capital_loss', loss_median)
    # Buckets are half-open intervals ( ]: upper bound included, lower excluded.
    # Split working hours into 10 buckets: 1-10, 11-20, ..., 91-100, mapped to 10s, 20s, ..., 100s.
    for x in range(10):
        a_set = []
        for y in range(10 * (x + 1)):
            a_set.append(str(y + 1))
        tiny(a_set, 'hr_per_week', str(10 * (x + 1)) + 's')
    # Bucket age in steps of 5: 1-5 -> 5s, 6-10 -> 10s, and so on.
    for x in range(20):
        a_set = []
        for y in range(5 * (x + 1)):
            a_set.append(str(y + 1))
        tiny(a_set, 'age', str(5 * (x + 1)) + 's')
    for key in classfiled_data:
        print(key)
        print(classfiled_data[key])
    return classfiled_data
dataset_low.classfied_dataset = classfy(dataset_low.gain_mid, dataset_low.loss_mid, dataset_low.data)
dataset_high.classfied_dataset = classfy(dataset_high.gain_mid, dataset_high.loss_mid, dataset_high.data)
def test(line):
    """Classify a single test record using the trained counts."""
    # P(each feature | income > 50K)
    p_low = dataset_low.len_data / (dataset_high.len_data + dataset_low.len_data)
    p_high = dataset_high.len_data / (dataset_high.len_data + dataset_low.len_data)
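    # Naive Bayes: P(class | x1..xn) is proportional to P(class) * prod_i P(xi | class).
    # p_low and p_high start from the class priors P(<=50K) and P(>50K) and are then
    # multiplied by each feature's in-class frequency (count / class size) below.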
    gain = int(line[-5])
    loss = int(line[-4])

    def get_level(value, mid_value):
        if value <= 0:
            return 'none'
        elif value < mid_value:
            return 'low'
        else:
            return 'high'

    for node in a[:-1]:
        i = a.index(node)
        if node in ["fnlwgt", "education_num"]:
            continue
        if line[i] in tiny_map:
            line[i] = tiny_map[line[i]]
        # Age
        if i == 0:
            line[i] = age_map[line[i]]
        # Working hours
        if i == 12:
            line[i] = hour_map[line[i]]
        # Capital gain/loss, bucketed with the low-income medians
        if node == 'capital_gain':
            line[i] = get_level(gain, dataset_low.gain_mid)
        if node == 'capital_loss':
            line[i] = get_level(loss, dataset_low.loss_mid)
        p_low *= dataset_low.classfied_dataset[node][line[i]] / dataset_low.len_data
        # Capital gain/loss, bucketed with the high-income medians
        if node == 'capital_gain':
            line[i] = get_level(gain, dataset_high.gain_mid)
        if node == 'capital_loss':
            line[i] = get_level(loss, dataset_high.loss_mid)
        p_high *= dataset_high.classfied_dataset[node][line[i]] / dataset_high.len_data
    # print(p_low, p_high)
    if p_low > p_high:
        return '<=50k'
    else:
        return '>50k'
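# Illustrative call on a hypothetical record in adult.data field order (15 values;
# test() never reads the final label field):
#   sample = ("39, State-gov, 77516, Bachelors, 13, Never-married, Adm-clerical, "
#             "Not-in-family, White, Male, 2174, 0, 40, United-States, <=50K").split(", ")
#   print(test(sample))  # -> '<=50k' or '>50k'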
print("训练数据的总数:\n >50k\t%d\n<=50k\t%d" % (len(dataset_high.data), len(dataset_low.data)))
with open('adult.test', 'r') as fin:
    line = fin.readline()
    right = 0
    wrong = 0
    while line:
        # Skip invalid lines (e.g. the header line)
        if len(line) < 25:
            line = fin.readline()
            continue
        line = line.replace('\n', '')
        line = line[:-1]  # drop the trailing '.' on adult.test labels
        line = line.split(', ')
        ans = test(line).upper()
        if line[-1] == ans:
            right += 1
        else:
            wrong += 1
        # print("%s%s" % (line[-1], ans))
        line = fin.readline()
print("Correct predictions:\t%d\nIncorrect predictions:\t%d\nAccuracy:\t%f" % (right, wrong, (right / (right + wrong))))