#!/usr/bin/env python

import sys, os, re, datetime, time

sys.path.append('/data/')
os.environ['DJANGO_SETTINGS_MODULE'] = 'surftrackr.settings'

from django.db import models
from surftrackr.log.models import *
from surftrackr.preferences.models import *
from django.contrib.auth.models import User as DjangoUser, check_password

# Line-format detectors used by log_parse() to choose a parser for each line.
# squid_log: the line starts with ten digits, a dot and three digits.
squid_log = re.compile(r'^\d{10}\.\d{3}')
# httpd_log: the line contains a "[dd/Mon/yyyy:hh:mm:ss" timestamp.
httpd_log = re.compile(r'\[\d{2}/\w{3}/\d{4}:\d{2}:\d{2}:\d{2}')
# web_site: captures, as group "site", the text between the first "//" and the
# next "/" (a URL without a "/" after the host does not match).
web_site = re.compile(r'^.*?//(?P<site>.*?)/')
# ssl_site: captures, as group "site", everything up to and including the
# first ":<digits>" (e.g. "host:443").
ssl_site = re.compile(r'^(?P<site>.*?:\d+)')
# ip_address: four dot-separated digit groups. The pattern is not anchored, so
# search() also finds an address embedded in a longer string.
ip_address = re.compile(r'\d+\.\d+\.\d+\.\d+')

def logger(severity, message):
    """Echo *message* to stdout and store it as a ``Message`` record.

    severity -- value stored in the record's ``severity`` field. Callers in
                this file pass 1 for fatal/lookup failures, 2 for a log line
                that could not be processed and 3 for progress information.
    message  -- text to print and store.

    The record's source is the script name (``sys.argv[0]``) and its
    timestamp is the current local time.
    """
    print(message)
    # create() already saves the new row, so no separate save() is needed.
    Message.objects.create(
        date_added=datetime.datetime.now(),
        severity=severity,
        source=sys.argv[0],
        text=message,
    )

def get_tld(domain):
    """Reduce a host name to its registrable base name.

    domain -- host name, optionally followed by ":port".

    Returns:
      * *domain* unchanged when it starts with a dotted-quad IP address;
      * "<label>.<suffix>" for the first entry of ``tlds_obj`` whose name is
        a dot-delimited suffix of the host (e.g. "www.example.co.uk" with
        suffix "co.uk" gives "example.co.uk");
      * the host itself when it is exactly equal to a suffix;
      * None, after logging a severity-1 message, when no suffix matches.
    """
    if ip_address.match(domain):
        return domain

    # Drop an optional ":port". maxsplit=1 keeps this from raising when the
    # string contains more than one colon.
    domain = domain.lower().split(':', 1)[0]

    for tld in tlds_obj:
        suffix = tld.name.lower()
        if not suffix:
            continue

        if domain == suffix:
            return domain

        # Require the dot boundary so that "example.telecom" is not treated
        # as ending in the TLD "com".
        if domain.endswith('.' + suffix):
            remainder = domain[:-(len(suffix) + 1)]
            label = remainder.rsplit('.', 1)[-1]
            return label + '.' + suffix

    logger(1, "TLD not found: " + domain)
    return None

def _parse_squid_line(line):
    """Pull the fields used by the importer out of a native squid log line.

    Returns a dict with the keys 'when', 'client', 'username', 'status',
    'size', 'method' and 'target', or None when the status code is above 599
    (such lines are skipped).  Raises IndexError/ValueError on malformed
    input.
    """
    fields = line.split()

    status = fields[3].split('/')[1]
    if int(status) > 599:
        return None

    # Entries are stored with one-second resolution.
    when = datetime.datetime.fromtimestamp(float(fields[0])).replace(microsecond=0)

    return {
        'when': when,
        'client': fields[2],
        'username': fields[7],
        'status': status,
        'size': fields[4],
        'method': fields[5],
        'target': fields[6],
    }


def _parse_httpd_line(line):
    """Pull the fields used by the importer out of an httpd log line.

    Brackets and double quotes are removed before splitting on whitespace.
    Returns the same dict shape as _parse_squid_line(), or None when the
    status code is above 599.  Raises IndexError/ValueError on malformed
    input.
    """
    fields = line.replace('[', '').replace(']', '').replace('"', '').split()

    status = fields[8]
    if int(status) > 599:
        return None

    when = datetime.datetime(*time.strptime(fields[3], "%d/%b/%Y:%H:%M:%S")[0:6])

    return {
        'when': when,
        'client': fields[0],
        # NOTE(review): field 1 is used as the user name; in the standard
        # common log format the authenticated user is field 2 -- confirm
        # against the log format actually in use.
        'username': fields[1],
        'status': status,
        'size': fields[9],
        'method': fields[5],
        'target': fields[6],
    }


def _note_visit(site_obj, created, when):
    """Set site_obj.last_visit to *when* if the site is new or *when* is later."""
    if created or when > site_obj.last_visit:
        site_obj.last_visit = when


def _store_log_entry(entry, usermap, manager_user):
    """Write one parsed log entry (and the rows it refers to) to the database.

    entry        -- dict produced by _parse_squid_line()/_parse_httpd_line().
    usermap      -- dict mapping a client address/name to a user name, used
                    when the log line carries no user name ('-').
    manager_user -- Django user assigned as manager of newly created users.

    Returns True when the entry was stored and False when it was skipped
    because the user has exclude_from_database set.  Raises ValueError when
    no site can be extracted from the requested URL; database lookup errors
    propagate to the caller.
    """
    client = entry['client']
    when = entry['when']

    if ip_address.search(client):
        workstation_obj, created = Workstation.objects.get_or_create(ip_addr=client)
    else:
        workstation_obj, created = Workstation.objects.get_or_create(dns_name=client)

    username = entry['username']
    if username == '-':
        username = usermap.get(client, client + '_user')

    # Look the user up by name only.  The flags are creation defaults; if
    # they were part of the lookup, a flagged or excluded user would never be
    # found and the exclusion test below could never succeed.
    user_obj, created = User.objects.get_or_create(
        username=username,
        defaults={
            'flagged': False,
            'exclude_from_database': False,
            'exclude_from_reports': False,
        },
    )
    if created:
        user_obj.managed_by = manager_user
        user_obj.save()
    elif user_obj.exclude_from_database:
        return False
    user_obj.workstation.add(workstation_obj)

    target = entry['target']
    site_match = web_site.search(target)
    if site_match:
        url = target
    else:
        site_match = ssl_site.search(target)
        if not site_match:
            # Fail loudly rather than reuse the site of a previous line.
            raise ValueError('Cannot determine site from URL: ' + target)
        url = 'https://' + target

    # Derive the MIME type from the file extension; anything without a known
    # extension is recorded as 'unknown'.
    file_name = url.split('/')[-1].split('?')[0]
    mime_type = None
    if '.' in file_name:
        try:
            mime_type = MimeType.objects.get(extension=file_name.split('.')[-1])
        except MimeType.DoesNotExist:
            pass
    if mime_type is None:
        mime_type = MimeType.objects.get(description='unknown')

    website_obj, created = Website.objects.get_or_create(name=site_match.group('site'))
    _note_visit(website_obj, created, when)
    website_obj.save()

    # NOTE(review): get_tld() returns None when no TLD matches, in which case
    # this lookup runs with name=None -- confirm that is acceptable.
    base_site_obj, created = Website.objects.get_or_create(name=get_tld(website_obj.name))
    if created:
        # A base site is its own top-level site.
        base_site_obj.top_level_site = base_site_obj
    _note_visit(base_site_obj, created, when)
    base_site_obj.save()

    website_obj.top_level_site = base_site_obj
    website_obj.save()

    method_obj = Method.objects.get(name=entry['method'])
    status_obj = Status.objects.get(code=entry['status'])

    Weblog.objects.get_or_create(
        user=user_obj,
        log_datetime=when,
        method=method_obj,
        url=url,
        status=status_obj,
        size=entry['size'],
        website=website_obj,
        mimetype=mime_type,
        flagged=0,
    )
    return True


def log_parse(log_filename):
    """Import every recognised line of *log_filename* into the database.

    Each line is matched against the squid and httpd detectors and, when
    recognised, parsed and stored.  A line that cannot be processed is
    reported through logger() with severity 2 and processing continues with
    the next line.  Lines with a status code above 599, and lines belonging
    to a user with exclude_from_database set, are skipped and not counted.

    If the file cannot be opened a severity-1 message is logged and nothing
    is processed.  The 'surftrackr' manager account is created on demand.
    Returns None.
    """
    try:
        logfile = open(log_filename, 'r')
    except IOError:
        logger(1, 'Cannot open logfile: "' + log_filename + '". Nothing processed. Quitting.')
        return

    try:
        logger(3, 'Started processing logfile: "' + log_filename + '"')
        total_lines = 0

        usermap = dict([(str(u.workstation), str(u.user)) for u in UserMapping.objects.all()])

        try:
            manager_user = DjangoUser.objects.get(username='surftrackr')
        except DjangoUser.DoesNotExist:
            pw = DjangoUser.objects.make_random_password(length=10, allowed_chars='abcdefghjkmnpqrstuvwxyzABCDEFGHJKLMNPQRSTUVWXYZ23456789')
            manager_user = DjangoUser.objects.create_user(username='surftrackr', email='surftrackr@example.com', password=pw)
            manager_user.is_staff = True
            manager_user.is_superuser = True
            manager_user.save()

        for line in logfile:
            if squid_log.search(line):
                parse_line = _parse_squid_line
            elif httpd_log.search(line):
                parse_line = _parse_httpd_line
            else:
                parse_line = None

            if parse_line is not None:
                try:
                    entry = parse_line(line)
                    if entry is None or not _store_log_entry(entry, usermap, manager_user):
                        continue
                except Exception:
                    # Best effort: report the bad line and carry on.  Using
                    # Exception (not a bare except) lets Ctrl-C and
                    # SystemExit stop the import.
                    logger(2, 'Problem with line: "' + line + '" from file "' + log_filename + '"')

            total_lines += 1

        logger(3, 'Finished processing ' + str(total_lines) + ' lines from logfile: "' + log_filename + '"')
    finally:
        logfile.close()

#---------------------------------------------------------------------------------------------------

# TLD records consulted by get_tld(), ordered by descending 'size'.
tlds_obj = Tld.objects.all().order_by('-size')

# Import every non-hidden file below 'logs', then move it to the same
# relative location below 'logs-processed'.
for root, dirs, files in os.walk('logs'):
    for log_name in files:
        if log_name[0] == '.':
            continue

        log_path = root + '/' + log_name
        print(log_path)

        # A FileLock row marks a file as taken; a file that already has one
        # is left alone.
        filelock, created = FileLock.objects.get_or_create(filename=log_path)
        if not created:
            continue
        filelock.started = datetime.datetime.now()
        filelock.save()

        log_parse(log_path)

        filelock.finished = datetime.datetime.now()
        filelock.save()

        # Create the archive directory without going through a shell, so
        # paths containing spaces or shell metacharacters are handled safely.
        archive_dir = 'logs-processed/' + root
        if not os.path.isdir(archive_dir):
            os.makedirs(archive_dir)
        os.rename(log_path, archive_dir + '/' + log_name)







