/* Home directory setting (the directory where the software is installed) */
%default HOME `echo \$HOME/Software/`

/* Avro uses the json-simple library and ships with the piggybank libraries. As of Pig 0.12, AvroStorage and TrevniStorage are built in */
REGISTER $HOME/pig/build/ivy/lib/Pig/avro-1.5.3.jar
REGISTER $HOME/pig/build/ivy/lib/Pig/json-simple-1.1.jar
REGISTER $HOME/pig/contrib/piggybank/java/piggybank.jar

DEFINE AvroStorage org.apache.pig.piggybank.storage.avro.AvroStorage();

/* MongoDB libraries and configuration */
REGISTER $HOME/mongo-hadoop/mongo-2.10.1.jar
REGISTER $HOME/mongo-hadoop/core/target/mongo-hadoop-core-1.1.0-SNAPSHOT.jar
REGISTER $HOME/mongo-hadoop/pig/target/mongo-hadoop-pig-1.1.0-SNAPSHOT.jar

DEFINE MongoStorage com.mongodb.hadoop.pig.MongoStorage();

/* Job settings: default parallelism of 5, speculative execution switched off for map and reduce tasks.
   NOTE(review): presumably speculation is disabled so that duplicate task attempts do not write twice to MongoDB - confirm */
set default_parallel 5
set mapred.map.tasks.speculative.execution false
set mapred.reduce.tasks.speculative.execution false

/* Macro headers_messages(email, col): one output row per address found in a
   single address-list header of the e-mail relation.
     email - relation of e-mail records
     col   - name of the header field to expand (this script passes
             tos, ccs, bccs and reply_tos)
   Steps: keep records where the header is present, flatten its address
   values, lowercase them, then drop rows whose address is null or empty
   or whose date is null.
   Returns rows of (address, message_id, subject, date).
   NOTE: according to the original author, as of Pig 0.11 macro definitions
   cannot be passed to Grunt - this file has to be run as a script */
DEFINE headers_messages(email, col) RETURNS set {
  with_header = FILTER $email BY ($col IS NOT NULL);
  per_address = FOREACH with_header
                GENERATE FLATTEN($col.address) AS $col, message_id, subject, date;
  lowercased = FOREACH per_address
               GENERATE LOWER($col) AS address, message_id, subject, date;
  $set = FILTER lowercased
         BY (address IS NOT NULL) AND (address != '') AND (date IS NOT NULL);
}

/* Clean out the MongoDB collections - they get rebuilt by this script.
   The drop commands below are left disabled, as in the original. */
-- sh mongo agile_data --quiet --eval 'db.emails_per_address.drop(); exit();'
-- sh mongo agile_data --quiet --eval 'db.addresses_per_email.drop(); exit();'

/* NOTE(review): no statement visible in this script writes to this path (results go to MongoDB).
   It looks like a leftover from an earlier version - confirm before removing */
rmf /tmp/emails_per_address.json

emails = load '/me/Data/test_mbox' using AvroStorage();

/* Sender rows: from.address is read directly instead of going through the macro.
   NOTE(review): presumably because from holds a single sender rather than a list - confirm against the Avro schema */
raw_froms = foreach emails generate LOWER(from.address) as address, message_id, subject, date;
froms = filter raw_froms by (address IS NOT NULL) and (address != '') and (date IS NOT NULL);

/* Recipient rows, one relation per address-list header */
tos = headers_messages(emails, 'tos');
ccs = headers_messages(emails, 'ccs');
bccs = headers_messages(emails, 'bccs');
reply_tos = headers_messages(emails, 'reply_tos');

/* All (address, message_id, subject, date) rows from every header.
   The same address can occur in more than one header of one message, or twice in
   one header once lowercased. Without DISTINCT such a message would be repeated in
   the per-address list (using up slots of the 50-message cap) and the address would
   be repeated in the per-message address bag. */
all_address_messages = UNION froms, tos, ccs, bccs, reply_tos;
address_messages = DISTINCT all_address_messages;

/* Messages grouped by address, sorted by date (descending). To keep access fast, at most 50 per address are kept.
   Each output record is (address, emails) where emails is a bag of (message_id, subject, date). */
by_address = GROUP address_messages BY address;
emails_per_address = FOREACH by_address {
    newest_first = ORDER address_messages BY date DESC;
    newest_50 = LIMIT newest_first 50;
    GENERATE group AS address,
             newest_50.(message_id, subject, date) AS emails;
}

STORE emails_per_address INTO 'mongodb://localhost/agile_data.emails_per_address' USING MongoStorage();

/* Addresses per e-mail: for every message_id, the bag of addresses that appear on that message. */
by_message = GROUP address_messages BY message_id;
addresses_per_email = FOREACH by_message
                      GENERATE group AS message_id,
                               address_messages.(address) AS addresses;
STORE addresses_per_email INTO 'mongodb://localhost/agile_data.addresses_per_email' USING MongoStorage();
