- dataset.py — Olej писал(а): «Дальше реализую модели классификации на этих (некоторых) наборах данных...»
Код: Выделить всё
# -*- coding: utf-8 -*-
try:
import urllib.request as url
except ImportError:
import urllib as url
import os
import subprocess
from sklearn import metrics, neighbors
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
def version():
    """Return the version string of this module."""
    ver = 'version 0.05'
    return ver
#-----------------------------------------------------------------------
def gunzip( response, zname, step = 100000 ):
    """Store the compressed *response* body as *zname*, decompress it with
    the external ``gunzip`` tool and return the decompressed text lines.

    :param response: file-like object whose ``read()`` yields the raw
                     compressed bytes (e.g. an urllib response).
    :param zname:    local file name for the downloaded archive.
    :param step:     chunk size in bytes used when reading the result back.
    :return:         list of lines of the decompressed UTF-8 text.
    """
    # dump the downloaded archive to disk
    with open( zname, 'wb' ) as fw:
        fw.write( response.read() )
    # -k keep the archive, -f force overwrite, -q quiet
    proc = subprocess.Popen( [ 'gunzip', '-kfq', zname ] )
    # BUG FIX: the original never waited for the child process, so the
    # decompressed file could be opened/read while gunzip was still
    # writing it (race condition)
    proc.wait()
    dname = zname[ :zname.rfind( '.' ) ]        # archive name sans suffix
    print( '{} => {}'.format( zname, dname ) )
    # read binary chunks and decode ONCE at the end: decoding every chunk
    # separately (as the original did) can split a multi-byte UTF-8
    # sequence across a chunk boundary and raise UnicodeDecodeError
    chunks = []
    with open( dname, 'rb' ) as fr:
        while True:
            d = fr.read( step )
            if not d:
                break
            chunks.append( d )
    data = b''.join( chunks ).decode( 'utf-8' )
    return data.splitlines()
#-----------------------------------------------------------------------
def loadtxt( dataset, delimiter = ',', help = True ): # load selected dataset
    """Download and parse one dataset described by a ``dbase``-style tuple.

    :param dataset:   tuple ( url, target-column index, first-feature index,
                      last-feature index ).
    :param delimiter: field separator in the downloaded text.
    :param help:      when True, print the dataset URL (parameter name kept
                      for backward compatibility although it shadows the
                      builtin).
    :return:          ( X, y ) — list of feature rows, list of target values.
    :raises SystemExit: when the URL cannot be opened.
    """
    def get( field ):
        # best effort: any non-numeric field becomes 0.0
        try:
            return float( field )
        except ValueError:
            return 0.0
    if help:
        print( 'dataset URL: {}'.format( dataset[ 0 ] ) )
    try:
        response = url.urlopen( dataset[ 0 ] )  # download from URL
    except IOError as err:
        print( 'wrong URL: {}'.format( err ) )
        # BUG FIX: the original called sys.exit() but never imported sys,
        # so the error path itself raised NameError; SystemExit(1) is the
        # equivalent behavior without the missing import
        raise SystemExit( 1 )
    # BUG FIX: URL paths are always '/'-separated; the original split on
    # os.sep, which is '\\' on Windows and would yield the whole URL
    fname = dataset[ 0 ].split( '/' )[ -1 ]
    if 'Z' == fname.split( '.' )[ -1 ]:         # compressed archive
        data = gunzip( response, fname )
    else:
        data = response.read().decode( 'utf-8' ).splitlines()
    arg = []; fun = []
    for line in data:
        lst = list( map( get, line.split( delimiter ) ) )
        arg.append( lst[ dataset[ 2 ] : dataset[ 3 ] + 1 ] )  # feature slice
        fun.append( lst[ dataset[ 1 ] ] )                     # target value
    return ( arg, fun )
#-----------------------------------------------------------------------
class model:
    """Registry of the classification models compared by the driver script.

    ``title[ i ]`` is the human-readable name of the estimator held in
    ``funct[ i ]``; the two tuples are kept index-aligned.
    """
    title = (
        'K Neighbors Classifier',
        'Logistic Regression',
        'Gaussian NB',
        'Decision Tree Classifier',
        'Support Vector Machines',
        'RandomForest',
    )
    funct = (
        neighbors.KNeighborsClassifier( n_neighbors = 5, weights = 'distance' ),
        LogisticRegression( solver = 'lbfgs', multi_class = 'auto', max_iter = 3000 ),
        GaussianNB(),
        DecisionTreeClassifier(),
        SVC( gamma = 'auto' ),
        RandomForestClassifier( n_estimators = 100 ),
    )
#-----------------------------------------------------------------------
# Dataset registry: every entry is a 4-tuple
#   ( URL, target-column index, first-feature index, last-feature index )
# consumed by loadtxt() via get_data().
dbase = (
    ( 'http://archive.ics.uci.edu/ml/machine-learning-databases/glass/glass.data',
      10, 1, 9 ),
    ( 'http://archive.ics.uci.edu/ml/machine-learning-databases/zoo/zoo.data',
      17, 1, 16 ),
    ( 'http://archive.ics.uci.edu/ml/machine-learning-databases/wine/wine.data',
      0, 1, 14 ),
    ( 'http://archive.ics.uci.edu/ml/machine-learning-databases/spambase/spambase.data',
      57, 0, 56 ),
    ( 'http://archive.ics.uci.edu/ml/machine-learning-databases/waveform/waveform.data.Z',
      21, 0, 20 ),
)
def get_data( i ):
    """Return the i-th dataset description tuple from ``dbase``."""
    return dbase[ i ]
#-----------------------------------------------------------------------
Код: Выделить всё
#!/usr/bin/python2
# -*- coding: utf-8 -*-
# Driver script: download one UCI dataset (see dataset.dbase) and evaluate
# the selected classification models on it.
import os, sys, argparse
from dataset import *
# command-line interface: dataset index, model selection string, verbosity
parser = argparse.ArgumentParser()
parser.add_argument( '-b', '--base', required = False, type = int,
                     default = 0, help = 'data base number' )
parser.add_argument( '-m', '--model', required = False,
                     default = '0', help = 'select model numbers' )
parser.add_argument( '-v', '--verbose', action = 'count',
                     default = 0, help = 'increase output verbosity' )
args = vars( parser.parse_args() )
debug = int( args[ 'verbose' ] ) # verbose level
print( version() )
if debug > 0:
    print( args )
X, y = loadtxt( get_data( args[ 'base' ] ) ) # load selected dataset
if debug > 0:
    print( y )
if debug > 1:
    print( X )
# NOTE(review): assumes a non-empty dataset — X[ 0 ] raises IndexError on an
# empty download
print( '{} instances with {} dimensions into {} classes: {}'.
       format( len( X ), len( X[ 0 ] ), len( set( y ) ), list( set( y ) ) ) )
#-----------------------------------------------------------------------
def model_list( spec ):
    """Parse the ``--model`` option and enable the selected models.

    *spec* is a comma-separated list of model numbers and/or inclusive
    ranges (e.g. "0,2,4-5"); for every valid index the corresponding
    estimator from ``model.funct`` is stored into the global ``mlist``.
    Invalid items are silently skipped (best effort, as before).
    Note: the original parameter was named ``str``, shadowing the builtin.
    """
    def add( i ):
        global mlist
        mlist[ i ] = model.funct[ i ]
    for item in spec.split( ',' ):
        lo, dash, hi = item.partition( '-' )
        if not dash:                        # single model number
            try:
                add( int( item ) )
            except ( ValueError, IndexError ):
                pass
        else:                               # inclusive range "lo-hi"
            # BUG FIX: dropped the original's stray debug print that dumped
            # the parsed range pieces to stdout on every range item
            try:
                for m in range( int( lo ), int( hi ) + 1 ):
                    add( m )
            except ( ValueError, IndexError ):
                pass
#-----------------------------------------------------------------------
# mlist[ i ] holds the estimator from model.funct[ i ] when model i was
# selected on the command line, otherwise None.
mlist = [ None for i in range( len( model.title ) ) ] # select models
model_list( args[ 'model' ] )
if debug > 0:
    for m in mlist:
        print( 'model: {}'.format( m ) )
for i in range( len( mlist ) ): # apply models
    if None == mlist[ i ]: continue
    print( model.title[ i ] )
    fmodel = mlist[ i ]
    fmodel.fit( X, y )
    # NOTE(review): each model is evaluated on its own training data, so the
    # report below measures fit quality, not generalization
    expected = y # make predictions
    predicted = fmodel.predict( X )
    # summarize the fit of the model
    print( metrics.classification_report( expected, predicted ) )
    print( metrics.confusion_matrix( expected, predicted ) )