#-------------------------------------------------------------------------------
# Raw string: the Windows-style path contains backslashes that must be printed
# verbatim (\E, \c and \P are not valid escape sequences).
print(r'ProBtWindows\Examples\chapter2\PCM_spam.py, lastchanges 2014/03/01')
#-------------------------------------------------------------------------------
import os.path
# Import all public names from pyplpath and pypl.
from pyplpath import * 
from pypl import *
#-------------------------------------------------------------------------------
# NOTE: every print below uses the single-argument call form already used by
# the banner print at the top of the file.  It emits the same text as the
# print statement under Python 2 and also parses under Python 3.
print(' ')
print('=============================== Data ==============================')
# Corpus sizes: non-spam emails, spam emails and their total.
nf = 250
print('%s %s' % ('nf     = #(non-spam-emails) ==>', nf))
nt = 750
print('%s %s' % ('nt     = #(spam-emails)     ==>', nt))
n = nf + nt
print('%s %s' % ('n      = #(emails)          ==>', n))

# Labels of the relevant words in emails: [..., word(i), ...]
lbs = ["fortune", "next", "programming", "money", "you"]
print('lbs    = [...,word(i) ,...] ==>')
print('%s %s' % ('        ', lbs))
# m is the number of words.
m = len(lbs)
print('%s %s' % ('m      = #words             ==>', m))

# Conditional frequencies #(word(i)|spam=0,1) of Table 2.1.
# Word frequencies in non-spam emails: [..., #(word(i)|spam=0), ...]
nfi = [0, 125, 250, 0, 125]
lnf = len(nfi)
print('%s %s' % ('nfi    = [..., #(word(i)|spam=0) ,...] ==>', nfi))
# Word frequencies in spam emails: [..., #(word(i)|spam=1), ...]
nti = [375, 0, 0, 750, 375]
lnt = len(nti)
print('%s %s' % ('nti    = [..., #(word(i)|spam=1) ,...] ==>', nti))

# The three lists are indexed in parallel by word number, so a length
# mismatch makes everything computed from them meaningless: stop here
# instead of only printing a warning and carrying on.
if not (m == lnf == lnt):
    raise ValueError(
        '!!! different lengths of lists !!! '
        '(lbs: %d, nfi: %d, nti: %d)' % (m, lnf, lnt))

# Unconditional frequencies #(word(i)|spam=*) = #(word(i)): element-wise sum
# of the two conditional frequency lists.
ni = list(map(sum, zip(nfi, nti)))
print('%s %s' % ('ni     = [..., #(word(i)|spam=*) ,...] ==>', ni))
print(' ')
print('============================== Types ==============================')
# plIntegerType(min, max) creates an integer type over the interval
# [min, max]; (0, 1) therefore gives the binary type used by every variable
# declared below.
binary_type = plIntegerType(0, 1)
print('%s %s' % ('binary_type = plIntegerType(0, 1)      ==>', binary_type))
print(' ')
print('============================ Variables ============================')
# Binary classification variable 'Spam'.
#   plSymbol(print_name, variable_type) builds a variable from a
#   user-chosen print name and a previously defined type.
Spam = plSymbol("Spam", binary_type)
print('%s %s' % ('Spam   = plSymbol("Spam",binary_type)  ==>', Spam))

# One binary variable per word, grouped in a one-dimensional array.
#   plArray(print_name, variable_type, tab_dim, ...) holds variables of type
#   variable_type; tab_dim is the number of dimensions and the remaining
#   arguments give the size of each dimension (here: m).
tab_dim = 1
print('%s %s' % ('tab_dim =', tab_dim))
W = plArray("W", binary_type, tab_dim, m)
print('%s %s' % ('W = plArray("W",binary_type,tab_dim,m) ==>', W))
print(' ')
print('============= Parametric Forms and (De-)Composition ===============')


def _smoothed_pair(count, total):
    """Return (P(W=0), P(W=1)) with P(W=1) = (1 + count) / (2 + total)."""
    p_one = float(1 + count) / float(2 + total)
    return 1 - p_one, p_one


# The local (conditional) distributions of the joint distribution are
# collected in a plComputableObjectList, the container later handed to
# plJointDistribution.
JointDistributionList = plComputableObjectList()
print('%s %s' % ('JointDistributionList = plComputableObjectList() ==>',
                 JointDistributionList))
print(' ')
print('--------------------------- P_Spam --------------------------------')
# Prior on the binary variable 'Spam'.
#   plProbTable(V, values, is_already_normalized=false) builds a probability
#   table on V filled from `values`; the raw counts [nf, nt] are passed and
#   is_already_normalized is left at its default.
P_Spam = plProbTable(Spam, [nf, nt])
print('%s %s' % ('P_Spam = plProbTable(Spam,[nf,nt])     ==>', P_Spam))
# push_back(o) appends a new element at the end of the list.
JointDistributionList.push_back(P_Spam)
print('%s %s' % ('JointDistributionList.push_back(P_Spam) ==>',
                 JointDistributionList))
print(' ')
print('--------------------- m CPDs P(W(i) | Spam) -----------------------')
# Build the m conditional distributions P(W(i) | Spam) of Table 2.2.
for i in range(m):
    # plDistributionTable(left, right) defines a conditional distribution on
    # `left`, indexed by the variables `right`, from building-block
    # distributions that share the same left variables.
    P_Wi_K_Spam = plDistributionTable(W[i], Spam)
    print(' ')
    print('%s %s %s' % ('----- row', i, 'of table 2.2 -----'))
    print(' ')
    print('%s %s %s %s %s' % ('Word', i, ': "', lbs[i], '"'))
    print('%s %s' % ('P_Wi_K_Spam = P(W[i]|Spam) ==>', P_Wi_K_Spam))

    # Distribution on W[i] for Spam = 0, from the non-spam counts.
    print('%s %s %s %s' % ('nfi[i] =', nfi[i], '   nf =', nf))
    P_Wi0_K_Spam0, P_Wi1_K_Spam0 = _smoothed_pair(nfi[i], nf)
    P_Wi_K_Spam0 = plProbTable(W[i], [P_Wi0_K_Spam0, P_Wi1_K_Spam0])
    print('%s %s' % (
        'P_Wi_K_Spam0 = plProbTable(W[i],[P_Wi0_K_Spam0, P_Wi1_K_Spam0]) ==>',
        P_Wi_K_Spam0))

    # Distribution on W[i] for Spam = 1, from the spam counts.
    print('%s %s %s %s' % ('nti[i] =', nti[i], '   nt =', nt))
    P_Wi0_K_Spam1, P_Wi1_K_Spam1 = _smoothed_pair(nti[i], nt)
    P_Wi_K_Spam1 = plProbTable(W[i], [P_Wi0_K_Spam1, P_Wi1_K_Spam1])
    print('%s %s' % (
        'P_Wi_K_Spam1 = plProbTable(W[i],[P_Wi0_K_Spam1, P_Wi1_K_Spam1]) ==>',
        P_Wi_K_Spam1))

    # push(compObj, value) inserts the distribution compObj under the given
    # integer key, i.e. the value of Spam it applies to.
    P_Wi_K_Spam.push(P_Wi_K_Spam0, 0)
    P_Wi_K_Spam.push(P_Wi_K_Spam1, 1)
    print(' ')
    print('%s %s' % ('P_Wi_K_Spam ==>', P_Wi_K_Spam))
    # Add the finished conditional distribution to the decomposition.
    JointDistributionList.push_back(P_Wi_K_Spam)
print(' ')
print('----------- (De-)Composition: JPD-List, JPD = Model ---------------')
print('%s %s' % ('JointDistributionList ==>', JointDistributionList))
# Build the model.
#   plJointDistribution(all_params, decomposition) describes a probabilistic
#   model as a product of elementary conditional and non-conditional
#   distributions over the whole set of model variables (here Spam^W).
print(' ')
model = plJointDistribution(Spam ^ W, JointDistributionList)
print('%s %s' % (
    'model = plJointDistribution(Spam^W, JointDistributionList) ==>', model))
print(' ')
# Export the model graph as a .dot file under <ExDir>/chapter2/figures.
# ExDir is not defined in this file; presumably it comes from the pyplpath
# star import.  An alternative kept from the original script (disabled):
#   model.draw_graph(os.path.join(ExDir, "chapter2", "data", "spam_graph"))
spam_graph_dot = os.path.join(ExDir, "chapter2", "figures", "spam_graph.dot")
model_graph = model.draw_graph_dot(spam_graph_dot)
print('%s %s' % (
    'model_graph = model.draw_graph_dot(os.path.join(ExDir, "chapter2", "figures", "spam_graph.dot")) ==>',
    model_graph))
print(' ')
print('============================= Question ============================')

# Define the question.
#   ask(search_params, known_params,
#       optimization=PL_NO_OPTIMIZATION,
#       do_not_build_normalization_expression=false)
#   returns the conditional distribution P(Spam | W).
question_CPD = model.ask(Spam, W)
print('formula (2.32) on p.30')
print('%s %s' % ('question_CPD = model.ask(Spam,W) ==>', question_CPD))
# plValues(variables) creates a container for the values of the given
# variable conjunction, initialised to the first value of the conjunction.
vals_of_W = plValues(W)
print(' ')
# W holds m binary variables, so there are 2**m assignments.  The exponent
# and the total are computed from m: the previous hard-coded text read
# "2**4 = 32", which is arithmetically wrong and disagreed with m.
print('There will be 2**|W| = 2**%d = %d assignments of values to W'
      % (m, 2 ** m))
print('%s %s' % ('vals_of_W = plValues(W) ==>', vals_of_W))
print(' ')
print('============================= Answers =============================')
# Walk over every possible assignment of W to build table 2.3.
# j is the 1-based number of the assignment.
for j, val_i in enumerate(vals_of_W, 1):
    print('%s %s %s %s' % ('assignment', j, ' of W ==>', val_i))
    # instantiate(values, ensure_normalization_on_compute=true) produces a
    # new distribution by fixing the known variables to `values`.  It is
    # called once per assignment and the result reused for both printouts,
    # instead of instantiating the same values twice.
    instantiated = question_CPD.instantiate(val_i)
    print('%s %s' % ('question_CPD.instantiate(i) ==>', instantiated))
    # compile() evaluates the distribution exhaustively (all points of the
    # discrete variable space) and stores the result as a table; it returns
    # a freshly created object.
    print('%s %s' % ('question_CPD.instantiate(i).compile() ==>',
                     instantiated.compile()))
print(' ')
print('%s %s' % ('question ==>', question_CPD))
print(' ')
print('======================== Special Question =========================')
# What is the probability distribution for a mail containing
# "next", "programming" and "you"?  evidence[k] == 1 marks word k as present.
evidence = [0, 1, 1, 0, 1]
print('%s %s' % ('assignment 14 ==>', evidence))
# Human-readable form of the query: every present word followed by a single
# space, wrapped as "P(Spam | ... )".
my_question = (
    "P(Spam | "
    + "".join(lbs[k] + " " for k, flag in enumerate(evidence) if flag == 1)
    + ")"
)
w = evidence
print(' ')
print('%s %s %s' % ('my_question =', my_question, "==> "))
print('%s %s' % ('question_CPD.instantiate(w).compile() ==>',
                 question_CPD.instantiate(w).compile()))
print('=============================== End ==============================')