#!/usr/bin/env python

import sys, os, time, re, datetime

# Both statements must run before the surftrackr model imports below:
# the first extends the module search path (presumably /data2/ is the
# directory that holds the surftrackr package -- confirm for your install),
# the second tells Django which settings module to load.
sys.path.append('/data2/')
os.environ['DJANGO_SETTINGS_MODULE'] = 'surftrackr.settings'

from surftrackr.log.models import *
from surftrackr.preferences.models import *

# Load the first GeneralSetting row and the logfile path stored on it.
# Without either there is nothing to tail, so report the problem and stop
# with a non-zero exit status so that init scripts and shells can tell the
# start-up failed.
try:
    general_setting = GeneralSetting.objects.all()[0]
except IndexError:
    print('No settings object found ("General Setting" in the Surftrackr admin interface). Quitting.')
    sys.exit(1)

logfile = general_setting.logfile
if not logfile:
    print('No logfile specified ("General Setting -> Logfile" in the Surftrackr admin interface). Quitting.')
    sys.exit(1)

# Line-format detectors used by line_parse():
#   squid_log -- line begins with 10 digits, a dot and 3 digits (the
#                timestamp that opens a native squid log line)
#   httpd_log -- line contains "[dd/Mon/yyyy:hh:mm:ss" (common log format)
squid_log = re.compile(r'^\d{10}\.\d{3}')
httpd_log = re.compile(r'\[\d{2}/\w{3}/\d{4}:\d{2}:\d{2}:\d{2}')
# Host extractors; both expose the result as the named group "site":
#   web_site -- text between the first "//" and the next "/"
#   ssl_site -- leading text up to and including ":<digits>", for requests
#               logged without a scheme (line_parse() prefixes "https://")
web_site = re.compile(r'^.*?//(?P<site>.*?)/')
ssl_site = re.compile(r'^(?P<site>.*?:\d+)')
# Four dot-separated digit runs. The pattern is not anchored, so the caller
# decides between match() (start of string) and search() (anywhere).
ip_address = re.compile(r'\d+\.\d+\.\d+\.\d+')

# Tld rows in descending "size" order; get_tld() walks them in this order
# and stops at the first suffix that fits.
# NOTE(review): presumably "size" ranks longer/multi-label suffixes first so
# that e.g. "co.uk" is tried before "uk" -- confirm against the Tld model.
tlds_obj = Tld.objects.all().order_by('-size')

def logger(severity, message):
    """Echo a message to stdout and record it as a Message row.

    severity -- value stored in the row's "severity" field
    message  -- text that is printed and stored in the row's "text" field

    The row is stamped with the current local time and with sys.argv[0]
    as its source. Returns None.
    """
    print(message)
    # create() builds the object and saves it in one step, so the extra
    # save() the code used to issue only repeated the database write.
    Message.objects.create(
        date_added=datetime.datetime.now(),
        severity=severity,
        source=sys.argv[0],
        text=message,
    )

def get_tld(domain):
    """Reduce a host name to "<label>.<suffix>" using the known suffixes.

    domain -- host as taken from a log line, optionally followed by ":port"

    Returns:
      * the input unchanged when it starts with an IPv4-style dotted quad;
      * the lower-cased host itself when it is exactly one of the suffixes
        in tlds_obj;
      * otherwise the label immediately left of the first matching suffix
        joined to that suffix, e.g. "www.example.co.uk" -> "example.co.uk"
        when "co.uk" is listed.

    When no suffix matches, the problem is logged with severity 1 and
    sys.exit() is called, i.e. SystemExit is raised.
    """
    if ip_address.match(domain):
        return domain

    # Keep only the host part. maxsplit=1 means a value with several colons
    # no longer blows up on tuple unpacking.
    domain = domain.lower().split(':', 1)[0]

    for tld in tlds_obj:
        suffix = tld.name.lower()
        if not suffix:
            # An empty name would "match" every host.
            continue
        if domain == suffix:
            return domain
        # Require a label boundary: a bare endswith(suffix) lets "om" claim
        # "example.com" and yields a nonsense base name.
        dotted_suffix = '.' + suffix
        if domain.endswith(dotted_suffix):
            owner = domain[:-len(dotted_suffix)].split('.')[-1]
            return owner + dotted_suffix

    # logger() prints the message itself, so no separate print is needed.
    logger(1, "TLD not found: " + domain)
    sys.exit(1)

def _mime_type_for(url):
    """Return the MimeType row for the extension of the URL's last segment.

    The query string is ignored. When the segment has no "." or its
    extension is not in the table, the row with primary key 0 is returned.
    """
    filename = url.split('/')[-1].split('?')[0]
    if '.' in filename:
        try:
            return MimeType.objects.get(extension=filename.split('.')[-1])
        except MimeType.DoesNotExist:
            pass
    return MimeType.objects.get(pk=0)


def _touch_last_visit(site, dt, created):
    """Move site.last_visit forward to dt when the row is new, unset or older."""
    if created or site.last_visit is None or dt > site.last_visit:
        site.last_visit = dt


def _store_hit(dt, client, ident, target, method, status, size):
    """Record one request in the database.

    dt     -- datetime of the request
    client -- requesting host as logged (IP address or DNS name)
    ident  -- user field from the log line; '-' means none was logged
    target -- requested URL, or "host:port" for requests logged without a
              scheme
    method -- request method name, looked up in Method
    status -- status code as logged, looked up in Status
    size   -- size field as logged

    Returns without storing anything for users marked
    exclude_from_database. Raises on any lookup or parse failure; the
    caller is responsible for handling that.
    """
    if ip_address.search(client):
        workstation_obj, created = Workstation.objects.get_or_create(ip_addr=client)
    else:
        workstation_obj, created = Workstation.objects.get_or_create(dns_name=client)

    if ident == '-':
        username = client + '_user'
    else:
        username = ident

    # Look the user up by name only. With the flag fields in the lookup, a
    # user who had been flagged or excluded was never found, so the
    # exclusion test below could not trigger and a second row with the same
    # username was attempted instead.
    user_obj, created = User.objects.get_or_create(
        username=username,
        defaults={
            'flagged': False,
            'exclude_from_database': False,
            'exclude_from_reports': False,
        },
    )
    if user_obj.exclude_from_database:
        return
    user_obj.workstation.add(workstation_obj)

    site_match = web_site.search(target)
    if site_match:
        url = target
    else:
        site_match = ssl_site.search(target)
        if not site_match:
            raise ValueError('No site found in "' + target + '"')
        url = 'https://' + target

    mime_type = _mime_type_for(url)

    website_obj, created = Website.objects.get_or_create(name=site_match.group('site'))
    _touch_last_visit(website_obj, dt, created)
    website_obj.save()

    website_base_name_obj, created = Website.objects.get_or_create(name=get_tld(website_obj.name))
    if created:
        # A base site is its own top-level site.
        website_base_name_obj.top_level_site = website_base_name_obj
    _touch_last_visit(website_base_name_obj, dt, created)
    website_base_name_obj.save()

    website_obj.top_level_site = website_base_name_obj
    website_obj.save()

    method_obj = Method.objects.get(name=method)
    status_obj = Status.objects.get(code=status)

    # get_or_create rather than create: an identical entry is not stored twice.
    Weblog.objects.get_or_create(
        user=user_obj,
        log_datetime=dt,
        method=method_obj,
        url=url,
        status=status_obj,
        size=size,
        website=website_obj,
        mimetype=mime_type,
        flagged=0,
    )


def line_parse(line):
    """Parse one access-log line and store it in the database.

    line -- a single log line without its trailing newline

    Native squid lines and common-log-format (httpd) lines are recognised;
    anything else is ignored. Entries whose status code is above 599 are
    skipped. Any failure while handling a recognised line is logged with
    severity 2 together with the original line, and the line is dropped.
    Returns None.
    """
    try:
        if squid_log.search(line):
            # Native squid format
            fields = line.split()
            status = fields[3].split('/')[1]
            if int(status) > 599:
                return

            # Whole seconds only, in local time.
            dt = datetime.datetime.fromtimestamp(float(fields[0])).replace(microsecond=0)
            _store_hit(dt, fields[2], fields[7], fields[6], fields[5], status, fields[4])

        elif httpd_log.search(line):
            # Common log format (httpd)
            # Work on a cleaned copy so the log message below still shows
            # the line exactly as it was read.
            fields = line.replace('[', '').replace(']', '').replace('"', '').split()
            if int(fields[8]) > 599:
                return

            dt = datetime.datetime(*time.strptime(fields[3], "%d/%b/%Y:%H:%M:%S")[0:6])
            _store_hit(dt, fields[0], fields[1], fields[6], fields[5], fields[8], fields[9])

    except (Exception, SystemExit):
        # SystemExit is listed because get_tld() calls sys.exit() for a host
        # with an unknown suffix; one such line must not stop the daemon.
        logger(2, 'Problem with line: "' + line + '"')


def main():
    """ Tail the logfile and enter details into the database.

    Starts at the current end of the file, so entries written before
    start-up are not processed, and then polls once a second for new
    lines. Each complete, non-blank line is handed to line_parse().
    Runs until the process is stopped; the file is closed if the loop is
    left through an exception.
    """

    # "Tail" code from http://aspn.activestate.com/ASPN/Cookbook/Python/Recipe/157035
    log = open(logfile, 'r')
    try:
        # whence=2: position relative to the end of the file.
        log.seek(0, 2)

        while 1:
            where = log.tell()
            raw = log.readline()
            if not raw.endswith('\n'):
                # Either nothing new (empty read) or the writer has not
                # finished the line yet: wait, then read again from the
                # same offset so the line is parsed whole.
                time.sleep(1)
                log.seek(where)
                continue

            entry = raw.strip()
            # A blank line is consumed and skipped. Treating it as "no
            # data" and seeking back would re-read it forever.
            if entry:
                line_parse(entry)
    finally:
        log.close()

if __name__ == "__main__":
    # Daemon code from http://aspn.activestate.com/ASPN/Cookbook/Python/Recipe/66012
    #
    # The UNIX double fork; see Stevens' "Advanced Programming in the UNIX
    # Environment" for details (ISBN 0201563177).

    # Fork #1: the process that was started from the shell exits at once.
    try:
        first_child = os.fork()
    except OSError:
        fork_error = sys.exc_info()[1]
        sys.stderr.write("fork #1 failed: %d (%s)" % (fork_error.errno, fork_error.strerror) + "\n")
        sys.exit(1)
    if first_child > 0:
        sys.exit(0)

    # Decouple from parent environment
    os.chdir("/")
    os.setsid()
    os.umask(0)

    # Fork #2: the intermediate process reports the daemon's PID and exits.
    try:
        daemon_pid = os.fork()
    except OSError:
        fork_error = sys.exc_info()[1]
        sys.stderr.write("fork #2 failed: %d (%s)" % (fork_error.errno, fork_error.strerror) + "\n")
        sys.exit(1)
    if daemon_pid > 0:
        print("Daemon PID %d" % daemon_pid)
        sys.exit(0)

    # Only the final child gets here: start the daemon main loop
    main()










