Done analysis.

This commit is contained in:
Daniel Lemire 2018-12-24 15:10:55 -05:00
parent 95d1dc24f6
commit 579bc0d848
4 changed files with 93 additions and 2 deletions

View File

@ -149,8 +149,8 @@ int main(int argc, char *argv[]) {
"false_count byte_count structural_indexes_count ");
#ifdef __linux__
printf(
" stage1_instruction_count stage1_cycle_count stage2_instruction_count "
"stage2_cycle_count stage3_instruction_count stage3_cycle_count ");
" stage1_cycle_count stage1_instruction_count stage2_cycle_count "
" stage2_instruction_count stage3_cycle_count stage3_instruction_count ");
#else
printf("(you are not under linux, so perf counters are disaabled)");
#endif

View File

@ -0,0 +1,28 @@
loading modeltable.txt
chosenpredictors= ['integer_count', 'float_count', 'string_count', 'backslash_count', 'nonasciibyte_count', 'object_count', 'array_count', 'null_count', 'true_count', 'false_count', 'byte_count', 'structural_indexes_count']
target = stage1_cycle_count
0.98 cycles per nonasciibyte_count
0.46 cycles per byte_count
R2 = 0.9987789976254756
target = stage2_cycle_count
2 cycles per structural_indexes_count
0.11 cycles per byte_count
R2 = 0.9944144115615688
target = stage3_cycle_count
2.5 cycles per float_count
2.2 cycles per string_count
1.4 cycles per structural_indexes_count
0.095 cycles per byte_count
R2 = 0.9983269621964512
target = total_cycles
6.6 cycles per string_count
6 cycles per float_count
2.6 cycles per structural_indexes_count
0.82 cycles per nonasciibyte_count
0.64 cycles per byte_count
R2 = 0.9994265124423222

View File

@ -0,0 +1,62 @@
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.preprocessing import normalize
from sklearn import metrics
def displaycoefs(coef_name):
    """Print one line per (coefficient, name) pair, largest pair first.

    coef_name is a list of (coefficient, predictor name) pairs. The list is
    reordered in place into descending order as a side effect, and each pair
    is then printed as "<coefficient> cycles per <name>".
    """
    # Ascending sort followed by a reversal, written back into the same list
    # object so the caller observes the reordering.
    coef_name[:] = reversed(sorted(coef_name))
    for coefficient, label in coef_name:
        print("\t%0.2g cycles per %s " % (coefficient, label))
# Fit a sparse, non-negative linear model "cycles = sum(coef * predictor)" for
# each stage's cycle count and for the total, then print the surviving
# coefficients and the R2 of the fit on the same data.
datafile = "modeltable.txt"  # from ./scripts/statisticalmodel.sh
predictors = ["integer_count", "float_count", "string_count", "backslash_count", "nonasciibyte_count", "object_count", "array_count", "null_count", "true_count", "false_count", "byte_count", "structural_indexes_count"]
targets = ["stage1_cycle_count", "stage1_instruction_count", "stage2_cycle_count", "stage2_instruction_count", "stage3_cycle_count", "stage3_instruction_count"]
print("loading", datafile)
# Whitespace-separated table without a header row; lines starting with '#'
# are comments. sep=r"\s+" is the supported spelling of the deprecated
# delim_whitespace=True, and names= already assigns the column labels.
dataset = pd.read_csv(datafile, sep=r"\s+", skip_blank_lines=True, comment="#", header=None, names=predictors + targets)
dataset['total_cycles'] = dataset['stage1_cycle_count'] + dataset['stage2_cycle_count'] + dataset['stage3_cycle_count']
# Cycles per input byte; not used by the fits below.
dataset['ratio'] = dataset['total_cycles'] / dataset['byte_count']
chosenpredictors = predictors
print("chosenpredictors=",chosenpredictors)
print()
chosentargets = ["stage1_cycle_count", "stage2_cycle_count", "stage3_cycle_count", "total_cycles"]
for t in chosentargets:
    print("target = ", t)
    # Maximum number of predictors allowed to keep a non-zero coefficient.
    howmany = 2  # stage1 and stage2: at most two predictors
    if t.startswith("stage3"):
        howmany = 4  # we allow for more
    if t.startswith("total"):
        howmany = 5  # we allow for more
    A = 10000000.0
    # The design matrix and the target do not depend on alpha: select once.
    x = dataset[chosenpredictors]
    y = dataset[[t]]
    # Raise the L1 penalty by 20% per round until few enough coefficients
    # remain non-zero.
    while True:
        # positive=True restricts coefficients to be >= 0 and
        # fit_intercept=False forces the model through the origin.
        # The removed 'normalize' keyword is not passed; its old default
        # was False, so the fit is unchanged.
        regressor = Lasso(max_iter=100000, alpha=A, positive=True, fit_intercept=False)
        regressor.fit(x, y)
        rest = [(c, n) for c, n in zip(regressor.coef_, chosenpredictors) if c != 0]
        if len(rest) > howmany:
            A *= 1.2
            continue
        displaycoefs(rest)
        print("R2 = ", regressor.score(x, y))
        break
    print()

View File

@ -0,0 +1 @@
# Run learn.py and redirect its standard output to analysis.txt (overwritten on each run).
python learn.py > analysis.txt