
/*
 * LIB/SPAMFILTER.C	- Spam filtering.  
 *
 *	The spam filter is really simple.  The NNTP-Posting-Host: header
 *	is placed in a relatively large cache with a 32 bit 'hit' counter.
 *	The counter is incremented for each hit, and decremented once a
 *	minute.  If the counter goes above 4, the entry is locked.  If
 *	the counter goes above 16, the posting source is filtered by 
 *	adding the message-id to the history file prior to the article
 *	commit.  The cache entry is not unlocked until the counter 
 *	returns to 0.
 *
 *	NOTE: We cannot open FilterFd in InitSpamFilter() because this
 *	occurs prior to any server forks and will cause the fcntl locks
 *	to be shared across the forks.  This means that if a child 
 *	creates a lock and is then killed, the lock will NOT automatically
 *	be removed.  Thus, we open FilterFd on the first call to SpamFilter()
 *	rather than in InitSpamFilter().
 *
 * (c)Copyright 1998, Matthew Dillon, All Rights Reserved.  Refer to
 *    the COPYRIGHT file in the base directory of this distribution
 *    for specific rights granted.
 */

#include "defs.h"

/*
 * Exported interface of this file.  Every line is tagged with the
 * project's Prototype keyword, which is defined outside this file.
 *
 * NOTE(review): TermSpamFilter() is listed twice below -- presumably
 * harmless, but confirm and drop one of the two lines.
 */
Prototype void SetSpamFilterOpt(void);
Prototype void SetFilterTrip(int n, int filterByNNTPHost);
Prototype void InitSpamFilter(void);
Prototype void TermSpamFilter(void);
Prototype int SpamFilter(hash_t mhv, const char *nntpPostingHost, hash_t *bhv, int *phow);
Prototype void TermSpamFilter(void);
Prototype void DumpSpamFilterCache(FILE *fo);
Prototype int FilterFd;

/*
 * Cache geometry.  F_HSIZE is the number of Filter slots in the cache
 * and F_HMASK reduces a hash value to a slot index, so F_HSIZE must
 * remain a power of two.  F_EXPIRE is the age, in seconds, after which
 * a slot is no longer considered live.
 */
#define F_HSIZE		65536
#define F_HMASK		(F_HSIZE - 1)
#define F_EXPIRE	(60 * 60)	/* one hour expire */

/*
 * Filter - one slot of the spam cache.
 *
 *	A slot is selected by hashing the key being rate-limited (see
 *	SpamFilterTable()).  Slots are read from and written to the
 *	spam cache file as raw structures, so this layout is also the
 *	record format of that file.
 */
typedef struct Filter {
    hash_t	f_Hash;		/* hash of the NNTP-Posting-Host: 	*/
    hash_t	f_MHash;	/* message-id hash			*/
    time_t	f_Time;		/* time demark				*/
    int32	f_HitCount;	/* incremented per hit, decremented per min */
    int32	f_FilterCount;	/* filtered postings			*/
} Filter;

/*
 * Module state.
 */

/* the F_HSIZE-slot cache; NULL until it has been attached or mapped */
Filter *FilterAry;
/* descriptor of the spam cache file, -1 while closed; used for slot locks */
int	FilterFd = -1;
/* a slot owned by another hash is only taken over below this hit count */
int	FilterLock = 4;
/* trip level for the body-hash check in SpamFilter(); 0 disables it */
int	FilterTrip = 0;
/* upper clamp applied to a slot's hit count */
int	FilterMax  = 100;
/* trip level for the NNTP-Posting-Host check in SpamFilter(); 0 disables it */
int	FilterByNNTPHost = 0;

/* internal worker shared by both checks in SpamFilter() */
int SpamFilterTable(time_t t, hash_t mhv, hash_t *hv, int sensitivity);

/*
 * SetSpamFilterOpt() - parse DOpts.SpamFilterOpt and configure the filter.
 *
 *	The option string is scanned left to right.  Recognised items:
 *
 *	' '	ignored
 *	B<n>	set the body trip level (FilterTrip) to <n>
 *	D	set the NNTP-Posting-Host trip level to 0 and stop later
 *		bare numbers from setting it again
 *	N<n>	set the NNTP-Posting-Host trip level to <n>
 *	Z	set both trip levels to 0
 *	<n>	set the body trip level to <n>/4+2 and, unless a 'D' was
 *		seen earlier, the NNTP-Posting-Host trip level to <n>
 *
 *	Numbers are parsed by strtol() with base 0 and the scan resumes
 *	right after whatever strtol() consumed, so signs, hex/octal
 *	prefixes and leading white space cannot stall the parser.  Any
 *	character that is neither a recognised letter nor the start of
 *	a number is skipped without changing any setting.
 *
 *	If at least one B, N or bare number was greater than zero the
 *	filter is initialised via InitSpamFilter(); otherwise the option
 *	string is freed and DOpts.SpamFilterOpt is set to NULL.
 */
void
SetSpamFilterOpt(void)
{
    if (DOpts.SpamFilterOpt != NULL) {
	char *ptr = DOpts.SpamFilterOpt;
	char *end;
	int n;
	int enabled = 0;
	int nntpPostDisabled = 0;

	while (*ptr) {
	    switch (*ptr) {
		case ' ':
		    ++ptr;
		    break;
		case 'B':
		    ++ptr;
		    /* with no number present strtol() yields 0, end == ptr */
		    n = (int)strtol(ptr, &end, 0);
		    SetFilterTrip(n, -1);
		    ptr = end;
		    if (n > 0)
			enabled = 1;
		    break;
		case 'D':
		    ++ptr;
		    SetFilterTrip(-1, 0);
		    nntpPostDisabled = 1;
		    break;
		case 'N':
		    ++ptr;
		    n = (int)strtol(ptr, &end, 0);
		    SetFilterTrip(-1, n);
		    ptr = end;
		    if (n > 0)
			enabled = 1;
		    break;
		case 'Z':
		    ++ptr;
		    SetFilterTrip(0, 0);
		    break;
		default:
		    n = (int)strtol(ptr, &end, 0);
		    if (end == ptr) {
			/*
			 * Not a number and not a known option letter.
			 * Step over it, otherwise the scan would never
			 * make progress.
			 */
			++ptr;
			break;
		    }
		    if (nntpPostDisabled)
			SetFilterTrip(n / 4 + 2, 0);
		    else
			SetFilterTrip(n / 4 + 2, n);
		    ptr = end;
		    if (n > 0)
			enabled = 1;
		    break;
	    }
	}
	if (enabled) {
	    InitSpamFilter();
	} else {
	    free(DOpts.SpamFilterOpt);
	    DOpts.SpamFilterOpt = NULL;
	}
    }
}

/*
 * SetFilterTrip() - update the two trip levels.
 *
 *	n			new FilterTrip value, ignored when negative
 *	filterByNNTPHost	new FilterByNNTPHost value, ignored when
 *				negative
 */
void
SetFilterTrip(int n, int filterByNNTPHost)
{
    /* a negative argument means "leave this setting alone" */
    if (!(filterByNNTPHost < 0))
	FilterByNNTPHost = filterByNNTPHost;
    if (!(n < 0))
	FilterTrip = n;
}

/*
 * InitSpamFilter() - set up the shared memory segment that holds the
 *		      spam cache.
 *
 *		      Does nothing if the cache is already set up or if
 *		      USE_SPAM_SHM is not enabled.
 *
 *		      We allocate a private segment, attach it and then
 *		      remove its id straight away so that it is not
 *		      persistent after the last process using it goes
 *		      away.  There is no way to reserve a permanent id
 *		      without possibly stomping on someone else in the
 *		      system using shared memory.
 *
 *		      The segment is zeroed and then preloaded with
 *		      whatever a previous run left in the spam cache
 *		      file (the file is created if it does not exist).
 *		      The descriptor used for that is local and closed
 *		      again before returning; FilterFd is not touched.
 *
 *		      Any failure is logged at LOG_CRIT and the process
 *		      exits.
 */

void
InitSpamFilter(void)
{
    int sid;
    struct shmid_ds ds;

    if (FilterAry == NULL) {
#if USE_SPAM_SHM
	int cacheFd;

	sid = shmget(IPC_PRIVATE, F_HSIZE * sizeof(Filter), SHM_R|SHM_W);
	if (sid < 0) {
	    syslog(LOG_CRIT, "sysv shared memory alloc failed, is your machine configured with a high enough maximum segment size?");
	    exit(1);
	}

	FilterAry = (Filter *)shmat(sid, NULL, SHM_R|SHM_W);

	/*
	 * Drop the id before looking at the attach result so the
	 * segment goes away with the last process attached to it.
	 */
	if (shmctl(sid, IPC_STAT, &ds) < 0 || shmctl(sid, IPC_RMID, &ds) < 0) {
	    syslog(LOG_CRIT, "sysv shmctl stat/rmid failed");
	    exit(1);
	}
	if (FilterAry == (Filter *)-1) {
	    FilterAry = NULL;
	    syslog(LOG_CRIT, "sysv shared memory map failed");
	    exit(1);
	}
	bzero(FilterAry, F_HSIZE * sizeof(Filter));

	/*
	 * Preload from the cache file.  The read is best effort: a
	 * short or empty file simply leaves the remaining slots zero.
	 */
	cacheFd = open(PatDbExpand(SpamCachePat), O_RDWR|O_CREAT, 0644);
	if (cacheFd < 0) {
	    syslog(LOG_CRIT, "unable to create spam.cache for lock reference");
	    exit(1);
	}
	(void)read(cacheFd, FilterAry, F_HSIZE * sizeof(Filter));
	close(cacheFd);

	syslog(LOG_INFO, "Initialised internal spam filter (bodytrip=%d  hosttrip=%d)",
				FilterTrip, FilterByNNTPHost);
#endif
    }
}

/*
 * TermSpamFilter() - save the cache and release it (USE_SPAM_SHM only).
 *
 *	When both the cache file and the cache are available, the whole
 *	cache is written to the start of the file and the file is
 *	sized to exactly one cache image.  The file is then closed and
 *	the shared memory segment detached.
 *
 *	The write-back stays best effort: failures are reported through
 *	syslog but never stop the shutdown.  Without USE_SPAM_SHM this
 *	function does nothing.
 */
void
TermSpamFilter(void)
{
#if USE_SPAM_SHM
    if (FilterFd >= 0 && FilterAry) {
	long bytes = (long)(F_HSIZE * sizeof(Filter));
	long n;

	if (lseek(FilterFd, 0L, SEEK_SET) < 0) {
	    /* do not write at an unknown offset */
	    syslog(LOG_ERR, "spam.cache: seek for write-back failed: %m");
	} else {
	    n = (long)write(FilterFd, FilterAry, F_HSIZE * sizeof(Filter));
	    if (n < 0)
		syslog(LOG_ERR, "spam.cache: write-back failed: %m");
	    else if (n != bytes)
		syslog(LOG_ERR, "spam.cache: short write-back (%ld of %ld bytes)", n, bytes);
	}
	if (ftruncate(FilterFd, F_HSIZE * sizeof(Filter)) < 0)
	    syslog(LOG_ERR, "spam.cache: ftruncate failed: %m");
    }
    if (FilterFd >= 0) {
	close(FilterFd);
	FilterFd = -1;
    }
    if (FilterAry) {
	shmdt((void *)FilterAry);
	FilterAry = NULL;
    }
#endif
}

/*
 * SpamFilter() - run the spam filter for one article.
 *
 *	mhv		  message-id hash of the article
 *	nntpPostingHost	  posting host, NULL or "" if there is none
 *	bhv		  second hash to rate-filter on, or NULL to skip
 *			  that check
 *	phow		  receives 0 if no check was run, 1 if the posting
 *			  host check was the last one run, 2 if the
 *			  second (bhv) check was
 *
 *	Returns 0 if the article is not filtered, otherwise the negative
 *	value handed back by SpamFilterTable().
 *
 *	The cache and its file are opened on first use.  With
 *	USE_SPAM_SHM the cache must already have been set up by
 *	InitSpamFilter(); if it was not, the process logs and exits.
 */

int
SpamFilter(hash_t mhv, const char *nntpPostingHost, hash_t *bhv, int *phow)
{
    time_t now = time(NULL);
    int result = 0;

    /*
     * make sure the cache is available
     */
    if (FilterAry == NULL) {
#if USE_SPAM_SHM
	syslog(LOG_CRIT, "unable to initialize spam filter cache");
	exit(1);
#else
	struct stat st;

	FilterFd = open(PatDbExpand(SpamCachePat), O_RDWR|O_CREAT, 0644);

	if (FilterFd >= 0 && fstat(FilterFd, &st) == 0) {
	    /* grow a short file to a full cache image before mapping it */
	    if (st.st_size < F_HSIZE * sizeof(Filter))
		ftruncate(FilterFd, F_HSIZE * sizeof(Filter));
	    FilterAry = xmap(
		NULL,
		F_HSIZE * sizeof(Filter),
		PROT_READ | (USE_SPAM_RW_MAP * PROT_WRITE),
		MAP_SHARED,
		FilterFd,
		0
	    );
	}
#endif
    }

    /*
     * make sure the lock file is open, but do not keep it around
     * without a cache to go with it
     */
    if (FilterFd < 0)
	FilterFd = open(PatDbExpand(SpamCachePat), O_RDWR|O_CREAT, 0644);

    if (FilterFd >= 0 && FilterAry == NULL) {
	close(FilterFd);
	FilterFd = -1;
    }

    *phow = 0;

    if (FilterAry == NULL)
	return(result);

    /*
     * check 1: rate by posting host
     */
    if (FilterByNNTPHost && nntpPostingHost && nntpPostingHost[0]) {
	hash_t hostHash = hhash(nntpPostingHost);

	result = SpamFilterTable(now, mhv, &hostHash, FilterByNNTPHost);
	*phow = 1;
    }

    /*
     * check 2: rate by the caller supplied hash, only if check 1
     * did not already filter the article
     */
    if (result == 0 && bhv != NULL && FilterTrip) {
	++mhv.h1;	/* use slightly different hash for dup body chk */
	result = SpamFilterTable(now, mhv, bhv, FilterTrip);
	*phow = 2;
    }
    return(result);
}

/*
 * SpamFilterTable() - run the rate filter for one key.
 *
 *	t		current time
 *	mhv		message-id hash, used to recognise a repeat of the
 *			same article so it is not counted twice
 *	hv		hash of the key being rate-limited; it selects the
 *			cache slot and identifies the slot's owner
 *	sensitivity	hit count at which the key is filtered
 *
 *	Returns 0 if the article is not filtered.  Otherwise returns the
 *	negated number of filtered postings recorded in the slot, and
 *	always a value <= -1.
 *
 *	The slot is locked through hflock() on FilterFd for the duration
 *	of the call.
 *
 *	An expired (or never used) slot is always reinitialised for the
 *	caller's key, even when the slot already belongs to that key.
 *	Routing a matching but expired slot through the hit path would
 *	leave it unwritten, because that path only writes back for
 *	non-duplicates, and the key would then go uncounted until some
 *	other key evicted it.
 */

int
SpamFilterTable(time_t t, hash_t mhv, hash_t *hv, int sensitivity)
{
    int r = 0;
    int i = (hv->h1 ^ hv->h2) & F_HMASK;	/* hash index */
    int off = i * sizeof(Filter);	/* map offset */
    Filter *f = &FilterAry[i];		/* structural pointer	*/
    time_t t0;
    int dhits = 0;
    int isdup = 1;
    int expired = 0;

    hflock(FilterFd, off, XLOCK_EX);

    t0 = f->f_Time;

    /*
     * calculate delta hits
     */
    {
	int32 dt = (int)(t - t0);

	if (t0 == 0 || dt < -10 || dt > F_EXPIRE) {
	    /*
	     * Slot is garbaged or long-expired, reset
	     * it.  Set dhits to force override.  Reset
	     * t0.
	     */
	    dhits = -f->f_HitCount;
	    t0 = t;
	    expired = 1;
	} else {
	    /*
	     * Slot ok (but may or may not match hash code).
	     *
	     * calculate per-minute rate, adjust dhits as if hash
	     * code were ok.
	     */
	    while (dt >= 60 && f->f_HitCount + dhits > 0) {
		--dhits;
		t0 += 60;
		dt -= 60;
	    }
	    /*
	     * Check for duplicate message-id.  If not a duplicate,
	     * enable write-back and bump dhits.
	     */
	    if (f->f_MHash.h1 != mhv.h1 || f->f_MHash.h2 != mhv.h2) {
		++dhits;
		isdup = 0;
	    }
	}
    }

    /*
     * cache hit / miss
     */

    if (!expired &&
	f->f_Hash.h1 == hv->h1 &&
	f->f_Hash.h2 == hv->h2
    ) {
	/*
	 * same-slot, valid.
	 */
	Filter copy;

	copy = *f;

	copy.f_Time = t0;
	copy.f_HitCount += dhits;
	copy.f_MHash = mhv;

	if (copy.f_HitCount <= 0) {	/* handle garbage  */
	    copy.f_HitCount = 0;
	    copy.f_Time = t;
	}
	if (copy.f_HitCount >= FilterMax)	/* handle garbage  */
	    copy.f_HitCount = FilterMax;

	if (copy.f_HitCount >= sensitivity) {
	    if (isdup == 0)
		++copy.f_FilterCount;
	    r = -copy.f_FilterCount;
	    if (r >= 0)		/* make sure r is negative */
		r = -1;
	}
	if (isdup == 0) {
#if USE_SPAM_RW_MAP
	    *f = copy;
#else
	    lseek(FilterFd, off, SEEK_SET);	/* seek & lock	*/
	    write(FilterFd, &copy, sizeof(Filter));
#endif
	}
    } else if (expired || f->f_HitCount + dhits < FilterLock) {
	/*
	 * reset slot: it is expired, or it belongs to another key
	 * whose hit count is low enough to be taken over.
	 */
	Filter copy = { { 0 } };

	copy.f_Hash = *hv;
	copy.f_MHash = mhv;
	copy.f_Time = t;
	copy.f_HitCount = 1;
	copy.f_FilterCount = 0;

#if USE_SPAM_RW_MAP
	*f = copy;
#else
	lseek(FilterFd, off, SEEK_SET);		/* seek & lock	*/
	write(FilterFd, &copy, sizeof(Filter));
#endif
    } else {
	/*
	 * slot belongs to another, still busy key: leave it alone,
	 * the caller's key is not tracked this time.
	 */
	syslog(LOG_INFO, "SpamFilter, slot %d in use: dt=%d, %d + %d\n",
	    i, 
	    (int)(t - f->f_Time),
	    f->f_HitCount, 
	    dhits
	);
    }
    hflock(FilterFd, off, XLOCK_UN);

    return(r);
}

/*
 * DumpSpamFilterCache() - print the live cache slots to fo.
 *
 *	SpamFilter() is called once with no posting host and no second
 *	hash, which runs no checks and only serves to bring the cache
 *	up.  Slots that were never used or that are F_EXPIRE or more
 *	seconds old are skipped.  For the rest the stored hit count is
 *	shown after taking off one hit per full minute of age (never
 *	below zero); the cache itself is not modified.
 *
 *	Prints "No spam hits" if the cache is available but nothing
 *	qualified, and nothing at all if the cache is not available.
 */
void
DumpSpamFilterCache(FILE *fo)
{
    hash_t nullHash = { 0 };
    time_t now = time(NULL);
    int how;
    int slot;
    int printed = 0;

    SpamFilter(nullHash, NULL, NULL, &how);

    if (FilterAry == NULL)
	return;

    for (slot = 0; slot < F_HSIZE; ++slot) {
	Filter *f = &FilterAry[slot];
	int32 age = now - f->f_Time;
	int hits = f->f_HitCount;

	if (f->f_Time == 0 || age >= F_EXPIRE)
	    continue;

	/* one hit wears off per full minute, but never below zero */
	if (hits > 0 && age >= 60) {
	    int32 minutes = age / 60;

	    hits = (minutes < hits) ? hits - minutes : 0;
	}
	if (hits < 0)
	    continue;

	if (!printed) {
	    fprintf(fo, "%5s %17s %17s %7s\n",
			    "entry",
			    "hash",
			    "hash",
			    "time"
	    );
	    printed = 1;
	}
	fprintf(fo, "%05x %08x.%08x %08x.%08x dt=%-4d hits=%-5d filtered=%-5d\n",
			slot,
			(int)f->f_Hash.h1,
			(int)f->f_Hash.h2,
			(int)f->f_MHash.h1,
			(int)f->f_MHash.h2,
			(int)age,
			(int)hits,
			(int)f->f_FilterCount
	);
    }
    if (!printed)
	fprintf(fo, "No spam hits\n");
}

