#------------------------------------------------------------------------------- print('ProBtWindows\Examples\chapter2\PCM_spam.py, lastchanges 2014/03/01') #------------------------------------------------------------------------------- import os.path # from pyplpath and pypl import all from pyplpath import * from pypl import * #------------------------------------------------------------------------------- print ' ' print '=============================== Data ==============================' nf = 250 print 'nf = #(non-spam-emails) ==>', nf nt = 750 print 'nt = #(spam-emails) ==>', nt n = nf + nt print 'n = #(emails) ==>', n # list of relevant word labels in emails: [..., word(i) ,...] lbs = ["fortune", "next", "programming", "money", "you"] print 'lbs = [...,word(i) ,...] ==>' print ' ', lbs #define the number of words m m = len(lbs) print 'm = #words ==>', m # conditional frequencies #(word(i)|spam=0,1) of Table 2.1 # list of frequencies words in non-spam emails: [..., #(word(i)|spam=0),...] nfi = [0, 125, 250, 0, 125] lnf = len(nfi) print 'nfi = [..., #(word(i)|spam=0) ,...] ==>', nfi # list of frequencies words in spam emails: [..., #(word(i)|spam=1),...] nti = [375, 0 , 0, 750, 375] lnt = len(nti) print 'nti = [..., #(word(i)|spam=1) ,...] ==>', nti if not (m == lnf and m == lnt): print '!!! different lengths of lists !!!' # unconditional frequencies #(word(i)|spam=*) = #(word(i)) ni = list(map(lambda x,y : x + y, nfi, nti)) print 'ni = [..., #(word(i)|spam=*) ,...] ==>', ni print ' ' print '============================== Types ==============================' # define a binary type # plIntegerType (int min, int max) # creates an integer type with interval [min,max]. binary_type = plIntegerType(0, 1) print 'binary_type = plIntegerType(0, 1) ==>', binary_type print ' ' print '============================ Variables ============================' # define the binary classification variable 'Spam' # plSymbol (const std::string &print_name, const plType &variable_type) # Constructor from a print_name as a string, selected by the user, # and a previously defined variable_type. Spam = plSymbol("Spam",binary_type) print 'Spam = plSymbol("Spam",binary_type) ==>', Spam # define m binary variables # plArray (const std::string &print_name, # const plType &variable_type, int tab_dim,...) # The array named print_name contains variables of type variable_type, # has dimensions tab_dim. # The other parameters first_dimension,... are the sizes for each dimension. tab_dim = 1 print 'tab_dim =', tab_dim W = plArray("W",binary_type,tab_dim,m) print 'W = plArray("W",binary_type,tab_dim,m) ==>', W print ' ' print '============= Parametric Forms and (De-)Composition ===============' # define the local (C)PDs of the JPD and push them into the JPD-List # A plComputableObjectList is especially used to concatenate a set of # plComputableObjects (to be used when creating a plJointDistribution # for example). JointDistributionList = plComputableObjectList() print 'JointDistributionList = plComputableObjectList() ==>',JointDistributionList print ' ' print '--------------------------- P_Spam --------------------------------' # define a prior distribution probability on binary variable 'Spam' # plProbTable (const plVariablesConjunction &V, # const std::vector< T > &values, # bool is_already_normalized=false) # Constructor 3: Constructs a probability table on the variable(s) V # and fills it using the values contained in the STL vector values. P_Spam = plProbTable(Spam,[nf,nt]) print 'P_Spam = plProbTable(Spam,[nf,nt]) ==>', P_Spam # void push_back (const plComputableObject &o) # Insert a new element at the end of the list. JointDistributionList.push_back(P_Spam) print 'JointDistributionList.push_back(P_Spam) ==>',JointDistributionList print ' ' print '--------------------- m CPDs P(W(i) | Spam) -----------------------' # define the m CPDs P(W(i) | Spam) of Table 2.2 for i in range(m): # A plDistributionTable is a way to define a conditional distribution # from a set of Computable Objects (Distributions and/or Conditional # having the same left variables than the building blocks. # plDistributionTable (const plVariablesConjunction &left, # const plVariablesConjunction &right) # A plDistributionTable indexed by the set of variables right. P_Wi_K_Spam = plDistributionTable(W[i],Spam) print ' ' print '----- row', i, 'of table 2.2 -----' print ' ' print 'Word', i, ': "',lbs[i], '"' print 'P_Wi_K_Spam = P(W[i]|Spam) ==>', P_Wi_K_Spam # define the two distributions on Wi # plProbTable (const plVariablesConjunction &V, # const std::vector< T > &values, # bool is_already_normalized=false) # Constructor 3: Constructs a probability table on the variable(s) V # and fills it using the values contained in the STL vector values. # one for Spam = 0 print 'nfi[i] =', nfi[i], ' nf =', nf P_Wi1_K_Spam0 = float(1+nfi[i])/float(2+nf) P_Wi0_K_Spam0 = 1 - P_Wi1_K_Spam0 P_Wi_K_Spam0 = plProbTable(W[i],[P_Wi0_K_Spam0, P_Wi1_K_Spam0]) print 'P_Wi_K_Spam0 = plProbTable(W[i],[P_Wi0_K_Spam0, P_Wi1_K_Spam0]) ==>',P_Wi_K_Spam0 #the other for Spam = 1 print 'nti[i] =', nti[i], ' nt =', nt P_Wi1_K_Spam1 = float(1+nti[i])/float(2+nt) P_Wi0_K_Spam1 = 1 - P_Wi1_K_Spam1 P_Wi_K_Spam1 = plProbTable(W[i],[P_Wi0_K_Spam1, P_Wi1_K_Spam1]) print 'P_Wi_K_Spam1 = plProbTable(W[i],[P_Wi0_K_Spam1, P_Wi1_K_Spam1]) ==>',P_Wi_K_Spam1 # # void push (const plComputableObject &compObj, int value) # Inserts a new distribution compObj with an specified integer key value value. P_Wi_K_Spam.push(P_Wi_K_Spam0, 0) P_Wi_K_Spam.push(P_Wi_K_Spam1 ,1) print ' ' print 'P_Wi_K_Spam ==>', P_Wi_K_Spam # and store it in a distribution list JointDistributionList.push_back(P_Wi_K_Spam) print ' ' print '----------- (De-)Composition: JPD-List, JPD = Model ---------------' print 'JointDistributionList ==>', JointDistributionList # define the model # The plJointDistribution class is used to describe a probabilistic model # by providing a decomposition of the joint distribution of the whole model # variables as a product of elementary conditional and non conditional # distributions. # plJointDistribution(const plVariablesConjunction &all_params, # const plComputableObjectList &decomposition) # creates a joint distribution (probabilistic model) as a product of # distributions. print ' ' model = plJointDistribution(Spam^W, JointDistributionList) print 'model = plJointDistribution(Spam^W, JointDistributionList) ==>', model print ' ' # model.draw_graph(os.path.join(ExDir, "chapter2", "data", "spam_graph")) model_graph = model.draw_graph_dot(os.path.join(ExDir, "chapter2", "figures", "spam_graph.dot")) print 'model_graph = model.draw_graph_dot(os.path.join(ExDir, "chapter2", "figures", "spam_graph.dot")) ==>', model_graph print ' ' print '============================= Question ============================' # define the question # plCndDistribution ask(const plVariablesConjunction &search_params, # const plVariablesConjunction &known_params, # plOptimizationCriterion optimization=PL_NO_OPTIMIZATION, # bool do_not_build_normalization_expression=false) const # returns CPD P(Spam|W) question_CPD = model.ask(Spam,W) print 'formula (2.32) on p.30' print 'question_CPD = model.ask(Spam,W) ==>', question_CPD # # plValues (const plVariablesConjunction &variables) # Create a set of plValues allowing to store the values of the variable # conjunction variable and initialize its value to the first value of # the conjunction. vals_of_W = plValues(W) print ' ' print 'There will be 2**|W| = 2**4 = 32 assignments of values to W' print 'vals_of_W = plValues(W) ==>', vals_of_W print ' ' print '============================= Answers =============================' #use all the possible values to build table 2.3 j = 0 for val_i in vals_of_W: j+=1 i = val_i print 'assignment', j,' of W ==>', i # plDistribution instantiate(const plValues &values, # bool ensure_normalization_on_compute=true) const # Produces a new distribution by instantiating the known variables # given by a the plValues values. print 'question_CPD.instantiate(i) ==>', question_CPD.instantiate(i) # plDistribution plDistribution::compile() const # Compiles the distribution using an exhaustive generator # (PL_EXHAUSTIVE_GENERATOR) (i.e. by generating all points of the discrete # or discretized variables space) and stores the result as a table (PL_TABLE). # Note that this method returns a freshly created object. # If you are calling it in a loop, consider using the version that modifies # an existing object instead print 'question_CPD.instantiate(i).compile() ==>', question_CPD.instantiate(i).compile() print ' ' print 'question ==>', question_CPD print ' ' print '======================== Special Question =========================' # what is the probability distribution for a mail containing: # "next" "programming" and "you" ? evidence = [0, 1, 1, 0, 1] print 'assignment 14 ==>', evidence my_question = "P(Spam | " for k in range(len(evidence)): if evidence[k] == 1: my_question = my_question + lbs[k] + " " my_question = my_question + ")" w = evidence print ' ' print 'my_question =', my_question, "==> " print 'question_CPD.instantiate(w).compile() ==>', question_CPD.instantiate(w).compile() print '=============================== End =============================='