/*
 * KHeiseReg
 *
 * A utility to search for articles within the Heise register.
 *
 * Copyright (C) 2002 Oliver Gantz <Oliver.Gantz@epost.de>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 */

#include <stdlib.h>
#include <qcstring.h>

#include "regfile.h"
#include "global.h"



// Translation table indexed by a byte of the register's IBM (DOS) character
// set, yielding the corresponding Latin-1 byte.  Characters without a Latin-1
// counterpart are mapped to a blank (0x20).
//
// The element type is unsigned char because many entries exceed 0x7f; storing
// them in a plain (possibly signed) char inside a braced initializer is a
// narrowing conversion that C++11 and later reject.
static const unsigned char ibm_to_latin1[256] = {
	0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x08,0x09,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f,
	0x10,0x11,0x12,0x13,0xb6,0xa7,0x16,0x17,0x18,0x19,0x1a,0x1b,0x1c,0x1d,0x1e,0x1f,
	0x20,0x21,0x22,0x23,0x24,0x25,0x26,0x27,0x28,0x29,0x2a,0x2b,0x2c,0x2d,0x2e,0x2f,
	0x30,0x31,0x32,0x33,0x34,0x35,0x36,0x37,0x38,0x39,0x3a,0x3b,0x3c,0x3d,0x3e,0x3f,
	0x40,0x41,0x42,0x43,0x44,0x45,0x46,0x47,0x48,0x49,0x4a,0x4b,0x4c,0x4d,0x4e,0x4f,
	0x50,0x51,0x52,0x53,0x54,0x55,0x56,0x57,0x58,0x59,0x5a,0x5b,0x5c,0x5d,0x5e,0x5f,
	0x60,0x61,0x62,0x63,0x64,0x65,0x66,0x67,0x68,0x69,0x6a,0x6b,0x6c,0x6d,0x6e,0x6f,
	0x70,0x71,0x72,0x73,0x74,0x75,0x76,0x77,0x78,0x79,0x7a,0x7b,0x7c,0x7d,0x7e,0x7f,
	0xc7,0xfc,0xe9,0xe2,0xe4,0xe0,0xe5,0xe7,0xea,0xeb,0xe8,0xef,0xee,0xec,0xc4,0xc5,
	0xc9,0xe6,0xc6,0xf4,0xf6,0xf2,0xfb,0xf9,0xff,0xd6,0xdc,0xa2,0xa3,0xa5,0x20,0x20,
	0xe1,0xed,0xf3,0xfa,0xf1,0xd1,0xaa,0xba,0xbf,0x20,0xac,0xbd,0xbc,0xa1,0xab,0xbb,
	0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,
	0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,
	0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,
	0x20,0xdf,0x20,0x20,0x20,0x20,0xb5,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,
	0x20,0xb1,0x20,0x20,0x20,0x20,0xf7,0x20,0xb0,0x20,0xb7,0x20,0x20,0xb2,0x20,0x20
};



// Translation table indexed by a Latin-1 byte, yielding the corresponding
// byte of the register's IBM (DOS) character set.  Characters without an IBM
// counterpart are mapped to a blank (0x20).
//
// The element type is unsigned char because many entries exceed 0x7f; storing
// them in a plain (possibly signed) char inside a braced initializer is a
// narrowing conversion that C++11 and later reject.
static const unsigned char latin1_to_ibm[256] = {
	0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x08,0x09,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f,
	0x10,0x11,0x12,0x13,0x20,0x20,0x16,0x17,0x18,0x19,0x1a,0x1b,0x1c,0x1d,0x1e,0x1f,
	0x20,0x21,0x22,0x23,0x24,0x25,0x26,0x27,0x28,0x29,0x2a,0x2b,0x2c,0x2d,0x2e,0x2f,
	0x30,0x31,0x32,0x33,0x34,0x35,0x36,0x37,0x38,0x39,0x3a,0x3b,0x3c,0x3d,0x3e,0x3f,
	0x40,0x41,0x42,0x43,0x44,0x45,0x46,0x47,0x48,0x49,0x4a,0x4b,0x4c,0x4d,0x4e,0x4f,
	0x50,0x51,0x52,0x53,0x54,0x55,0x56,0x57,0x58,0x59,0x5a,0x5b,0x5c,0x5d,0x5e,0x5f,
	0x60,0x61,0x62,0x63,0x64,0x65,0x66,0x67,0x68,0x69,0x6a,0x6b,0x6c,0x6d,0x6e,0x6f,
	0x70,0x71,0x72,0x73,0x74,0x75,0x76,0x77,0x78,0x79,0x7a,0x7b,0x7c,0x7d,0x7e,0x7f,
	0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,
	0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,0x20,
	0x20,0xad,0x9b,0x9c,0x20,0x9d,0x20,0x15,0x20,0x20,0xa6,0xae,0xaa,0x20,0x20,0x20,
	0xf8,0xf1,0xfd,0x20,0x20,0xe6,0x14,0xfa,0x20,0x20,0xa7,0xaf,0xac,0xab,0x20,0xa8,
	0x20,0x20,0x20,0x20,0x8e,0x8f,0x92,0x80,0x20,0x90,0x20,0x20,0x20,0x20,0x20,0x20,
	0x20,0xa5,0x20,0x20,0x20,0x20,0x99,0x20,0x20,0x20,0x20,0x20,0x9a,0x20,0x20,0xe1,
	0x85,0xa0,0x83,0x20,0x84,0x86,0x91,0x87,0x8a,0x82,0x88,0x89,0x8d,0xa1,0x8c,0x8b,
	0x20,0xa4,0x95,0xa2,0x93,0x20,0x94,0xf6,0x20,0x97,0xa3,0x96,0x81,0x20,0x20,0x98
};




// Constructs a search list; the body performs no initialization of its own.
RegSearchList::RegSearchList()
{
}


// Parses a search expression and rebuilds the three keyword lists.
//
// The text is converted to Latin-1 and then translated byte by byte through
// latin1_to_ibm, so the stored keywords are in the IBM character set.  Note
// that characters the table cannot represent become blanks and therefore act
// as keyword separators.
//
// Syntax, as implemented below:
//   word      - appended to m_include
//   +word     - appended to m_require
//   -word     - appended to m_exclude
//   "a b"     - a quoted keyword may contain blanks; a missing closing quote
//               extends the keyword to the end of the text
//
// Empty keywords (a lone '+' or '-', an empty pair of quotes, an operator
// followed by a blank) are ignored instead of being stored.
void RegSearchList::setKeyWords(const QString &text)
{
	QCString input(text.latin1());
	const int l = input.length();
	int s, e;

	m_include.clear();
	m_require.clear();
	m_exclude.clear();

	for (s = 0; s < l; s++)
		input[s] = latin1_to_ibm[(unsigned char)input[s]];

	s = 0;
	while (s < l) {
		bool inc = false;
		bool exc = false;

		// Skip separating blanks without reading past the end of the text
		while (s < l && input.at(s) == ' ')
			s++;
		if (s >= l)
			break;

		// Optional operator prefix
		if (input.at(s) == '+') {
			s++;
			inc = true;
		} else if (input.at(s) == '-') {
			s++;
			exc = true;
		}

		// Find the end of the keyword: closing quote or next blank
		e = s;
		if (s < l && input.at(s) == '"') {
			s++;
			e++;
			while (e < l && input.at(e) != '"')
				e++;
		} else {
			while (e < l && input.at(e) != ' ')
				e++;
		}

		// Only keep keywords that actually contain characters
		if (e > s) {
			if (inc)
				m_require.append(input.mid(s, e - s));
			else if (exc)
				m_exclude.append(input.mid(s, e - s));
			else
				m_include.append(input.mid(s, e - s));
		}

		// Continue behind the terminating blank or quote
		s = e + 1;
	}
}


bool RegSearchList::matches(const QCString &text, bool cs) const
{
	RegCStrList::ConstIterator it;

	for (it = m_exclude.begin(); it != m_exclude.end(); ++it)
		if (text.contains(*it, cs))
			return false;

	for (it = m_require.begin(); it != m_require.end(); ++it)
		if (!text.contains(*it, cs))
			return false;

	if (m_include.isEmpty())
		return true;

	for (it = m_include.begin(); it != m_include.end(); ++it)
		if (text.contains(*it, cs))
			return true;

	return false;
}


bool RegSearchList::matchesFuzzy(const QCString &text, bool cs, int threshold) const
{
	RegCStrList::ConstIterator it;

	for (it = m_exclude.begin(); it != m_exclude.end(); ++it)
		if (stringContainsFuzzy(text, *it, cs, threshold))
			return false;

	for (it = m_require.begin(); it != m_require.end(); ++it)
		if (!stringContainsFuzzy(text, *it, cs, threshold))
			return false;

	if (m_include.isEmpty())
		return true;

	for (it = m_include.begin(); it != m_include.end(); ++it)
		if (stringContainsFuzzy(text, *it, cs, threshold))
			return true;

	return false;
}


// Decides whether substr "approximately" occurs in str.
//
// The keyword is cut into all of its overlapping three-character pieces
// (trigrams).  The result is true when the percentage of trigrams found in
// str reaches threshold (so threshold is a value on a 0..100 scale).
// cs is handed through to QCString::contains() as its case-sensitivity flag.
//
// Keywords shorter than three characters have no trigram at all; for those
// a plain substring search is used, which avoids dividing by a zero or
// negative trigram count.
bool RegSearchList::stringContainsFuzzy(const QCString &str, const QCString &substr, bool cs, int threshold) const
{
	const int ngramcount = (int)substr.length() - 2;

	if (ngramcount < 1)
		return str.contains(substr, cs) > 0;

	char ngram[4];
	int count = 0;

	ngram[3] = '\0';

	for (int i = 0; i < ngramcount; i++) {
		// Copies three characters starting at position i
		qstrncpy(ngram, &substr[i], 4);

		if (str.contains(ngram, cs))
			count++;
	}

	return (100 * count / ngramcount) >= threshold;
}




// Constructs a number list; the body performs no initialization of its own.
RegSearchNumList::RegSearchNumList()
{
}


// Rebuilds m_include from a blank-separated list of numbers.
//
// Tokens that QCString::toInt() cannot convert are silently skipped.  When
// year is true, values below 100 are expanded to four digits: values below
// 81 get 2000 added, all others 1900.
void RegSearchNumList::setNumbers(const QString &text, bool year)
{
	QCString input(text.latin1());

	m_include.clear();

	// Collapse runs of white space so that single blanks separate the tokens
	input = input.simplifyWhiteSpace();
	const int len = input.length();

	int from = 0;
	while (from < len) {
		int to = input.find(' ', from);
		if (to == -1)
			to = len;

		bool ok;
		int value = input.mid(from, to - from).toInt(&ok);
		if (ok) {
			if (year && value < 100) {
				if (value < 81)
					value += 2000;
				else
					value += 1900;
			}
			m_include.append(value);
		}

		from = to + 1;
	}
}


bool RegSearchNumList::matches(int num) const
{
	RegNumList::ConstIterator it;

	if (m_include.isEmpty())
		return true;

	for (it = m_include.begin(); it != m_include.end(); ++it)
		if (num == *it)
			return true;

	return false;
}




// Constructs an entry; the body performs no initialization of its own.
RegEntry::RegEntry()
{
}


// Returns line number num of the entry as a Latin-1 string.
//
// A trailing LF and, before it, a trailing CR are removed (at most one of
// each), then every remaining byte is translated through ibm_to_latin1.
// num is used as an index into m_lines without a range check.
QCString RegEntry::lineStr(int num) const
{
	char buff[REG_LINE_SIZE];

	qstrcpy(buff, m_lines[num]);
	int len = qstrlen(buff);

	// Strip the line terminator
	if (len > 0 && buff[len - 1] == '\x0a')
		--len;
	if (len > 0 && buff[len - 1] == '\x0d')
		--len;
	buff[len] = '\0';

	// Convert the remaining characters in place
	for (char *p = buff; p < buff + len; ++p)
		*p = ibm_to_latin1[(unsigned char)*p];

	return QCString(buff);
}


// Returns the numeric value atoi() reads from the entry's page line.
int RegEntry::page() const
{
	const char *field = m_lines[REG_LINE_PAGE];

	return atoi(field);
}


// Returns the numeric value atoi() reads from the entry's edition line.
int RegEntry::edition() const
{
	const char *field = m_lines[REG_LINE_EDITION];

	return atoi(field);
}

	
// Maps the first character of the magazine/year line to a magazine id:
// 'c' yields REG_MAGAZINE_CT, 'i' yields REG_MAGAZINE_IX and anything else
// yields REG_MAGAZINE_NONE.
unsigned char RegEntry::magazine() const
{
	switch (*m_lines[REG_LINE_MAGYEAR]) {
	case 'c':
		return REG_MAGAZINE_CT;
	case 'i':
		return REG_MAGAZINE_IX;
	default:
		return REG_MAGAZINE_NONE;
	}
}


// Returns the four-digit year of the entry.
//
// The number following the magazine character on the magazine/year line is
// read with atoi(); values below 81 get 2000 added, all others 1900.
int RegEntry::year() const
{
	const int shortYear = atoi(m_lines[REG_LINE_MAGYEAR] + 1);

	if (shortYear < 81)
		return shortYear + 2000;

	return shortYear + 1900;
}


// Returns true if c is a decimal digit.
static bool regIsDigit(char c)
{
	return (c >= '0') && (c <= '9');
}


// Returns true if c is a blank or a decimal digit.
static bool regIsBlankOrDigit(char c)
{
	return (c == ' ') || regIsDigit(c);
}


// Performs a plausibility check on the lines of the entry.
//
// Returns true only if
//  - every line ends with CR LF,
//  - the page line consists of three blanks/digits plus CR LF,
//  - the edition line consists of two blanks/digits plus CR LF,
//  - the magazine/year line consists of 'c' or 'i', two digits and CR LF.
bool RegEntry::verify() const
{
	int i, len;

	// - Do all lines end with CR LF?
	for (i = 0; i < REG_LINE_COUNT; i++) {
		len = qstrlen(m_lines[i]);
		if ((len < 2) || (m_lines[i][len-2] != 0x0d) || (m_lines[i][len-1] != 0x0a))
			return false;
	}

	// - Does line 5 (page number) only contain blanks and digits?
	if (qstrlen(m_lines[REG_LINE_PAGE]) != 5)
		return false;
	for (i = 0; i < 3; i++)
		if (!regIsBlankOrDigit(m_lines[REG_LINE_PAGE][i]))
			return false;

	// - Does line 6 (edition) only contain blanks and digits?
	if (qstrlen(m_lines[REG_LINE_EDITION]) != 4)
		return false;
	for (i = 0; i < 2; i++)
		if (!regIsBlankOrDigit(m_lines[REG_LINE_EDITION][i]))
			return false;

	// - Does line 7 (magazine/year) only contain the magazine's id and digits?
	if (qstrlen(m_lines[REG_LINE_MAGYEAR]) != 5)
		return false;
	if ((m_lines[REG_LINE_MAGYEAR][0] != 'c') && (m_lines[REG_LINE_MAGYEAR][0] != 'i'))
		return false;
	if (!regIsDigit(m_lines[REG_LINE_MAGYEAR][1]) || !regIsDigit(m_lines[REG_LINE_MAGYEAR][2]))
		return false;

	return true;
}



// Constructs a mask with no magazine selected, case-insensitive exact
// matching and a fuzzy threshold of 100.
RegMask::RegMask()
	: m_magazines(REG_MAGAZINE_NONE),
	  m_cs(false),
	  m_fuzzy(false),
	  m_threshold(100)
{
}


// Replaces the byword keywords of the mask by parsing text with
// RegSearchList::setKeyWords().
void RegMask::setBywords(const QString &text)
{
	m_bywords.setKeyWords(text);
}


// Replaces the author keywords of the mask by parsing text with
// RegSearchList::setKeyWords().
void RegMask::setAuthors(const QString &text)
{
	m_authors.setKeyWords(text);
}


// Replaces the editor keywords of the mask by parsing text with
// RegSearchList::setKeyWords().
void RegMask::setEditors(const QString &text)
{
	m_editors.setKeyWords(text);
}


// Replaces the edition numbers of the mask by parsing text with
// RegSearchNumList::setNumbers(); the numbers are taken as they are
// (no two-digit year expansion).
void RegMask::setEditions(const QString &text)
{
	m_editions.setNumbers(text, false);
}


// Replaces the years of the mask by parsing text with
// RegSearchNumList::setNumbers(); two-digit values are expanded to
// four-digit years.
void RegMask::setYears(const QString &text)
{
	m_years.setNumbers(text, true);
}


// Tests a register entry against the complete mask.
//
// The entry must belong to a selected magazine and pass the edition and
// year lists.  The byword keywords must then match at least one of the
// entry's bywords, title or subtitle, and the author and editor keywords
// must match the respective fields.  Depending on m_fuzzy the keyword
// tests use either matchesFuzzy() with m_threshold or plain matches().
bool RegMask::matches(const RegEntry &entry) const
{
	// Cheap numeric filters first
	const bool preselected = (m_magazines & entry.magazine())
		&& m_editions.matches(entry.edition())
		&& m_years.matches(entry.year());
	if (!preselected)
		return false;

	if (m_fuzzy) {
		const bool wordsFound =
			m_bywords.matchesFuzzy(entry.bywords(), m_cs, m_threshold)
			|| m_bywords.matchesFuzzy(entry.title(), m_cs, m_threshold)
			|| m_bywords.matchesFuzzy(entry.subTitle(), m_cs, m_threshold);

		return wordsFound
			&& m_authors.matchesFuzzy(entry.author(), m_cs, m_threshold)
			&& m_editors.matchesFuzzy(entry.editor(), m_cs, m_threshold);
	}

	const bool wordsFound =
		m_bywords.matches(entry.bywords(), m_cs)
		|| m_bywords.matches(entry.title(), m_cs)
		|| m_bywords.matches(entry.subTitle(), m_cs);

	return wordsFound
		&& m_authors.matches(entry.author(), m_cs)
		&& m_editors.matches(entry.editor(), m_cs);
}

	
	
// Constructs a register file object without a file name.
//
// The statistics are reset to the same "nothing scanned yet" state the
// named constructor and scanEntries() start from, so that firstEdition(),
// lastEdition() and articles() return defined values before a scan.
RegFile::RegFile(): QFile()
{
	ct_first_ed = 999999;
	ct_last_ed = 0;
	ct_articles = 0;
	ix_first_ed = 999999;
	ix_last_ed = 0;
	ix_articles = 0;
}


// Constructs a register file object for the file called name and resets
// the statistics of both magazines: first edition to the sentinel 999999,
// last edition and article count to 0.
RegFile::RegFile(const QString &name): QFile(name)
{
	ct_first_ed = ix_first_ed = 999999;
	ct_last_ed = ix_last_ed = 0;
	ct_articles = ix_articles = 0;
}


// Destructor; the body performs no cleanup of its own.
RegFile::~RegFile()
{
}


// Reads REG_LINE_COUNT consecutive lines from the file into the line
// buffers of entry.  Returns false as soon as readLine() reports -1,
// true once all lines have been read.
bool RegFile::readEntry(RegEntry *entry)
{
	int row = 0;

	while (row < REG_LINE_COUNT) {
		if (readLine(entry->line(row), REG_LINE_SIZE) == -1)
			return false;
		++row;
	}

	return true;
}


bool RegFile::scanEntries()
{
	RegEntry entry;
	int edition;
		
	ct_first_ed = 999999;
	ct_last_ed = 0;
	ct_articles = 0;
	ct_editions.clear();
	ix_first_ed = 999999;
	ix_last_ed = 0;
	ix_articles = 0;
	ix_editions.clear();
	
	at(0);

	while (!atEnd()) {
		if (!readEntry(&entry))
			return false;
		if (!entry.verify())
			return false;
		edition = entry.year() * 100 + entry.edition();
	  if (entry.magazine() == REG_MAGAZINE_CT) {
	  	if (ct_first_ed > edition)
	  		ct_first_ed = edition;
	  	if (ct_last_ed < edition)
	  		ct_last_ed = edition;
	  	ct_articles++;
	  	if (!ct_editions.contains(edition))
	  		ct_editions.append(edition);
	  } else if (entry.magazine() == REG_MAGAZINE_IX) {
	  	if (ix_first_ed > edition)
	  		ix_first_ed = edition;
	  	if (ix_last_ed < edition)
	  		ix_last_ed = edition;
	  	ix_articles++;
	  	if (!ix_editions.contains(edition))
	  		ix_editions.append(edition);
	  }
	}

	at(0);

	return true;
}


// Returns the first edition recorded for a magazine, or 0 if the stored
// value still is the sentinel 999999.  The c't value is used when mag has
// the REG_MAGAZINE_CT bit set, the iX value otherwise.
int RegFile::firstEdition(unsigned char mag) const
{
	const int first = (mag & REG_MAGAZINE_CT) ? ct_first_ed : ix_first_ed;

	return (first == 999999) ? 0 : first;
}


// Returns the last edition recorded for a magazine: the c't value when mag
// has the REG_MAGAZINE_CT bit set, the iX value otherwise.
int RegFile::lastEdition(unsigned char mag) const
{
	if (mag & REG_MAGAZINE_CT)
		return ct_last_ed;

	return ix_last_ed;
}


// Returns the number of articles counted for a magazine: the c't count
// when mag has the REG_MAGAZINE_CT bit set, the iX count otherwise.
int RegFile::articles(unsigned char mag) const
{
	if (mag & REG_MAGAZINE_CT)
		return ct_articles;

	return ix_articles;
}


bool RegFile::containsEditions(unsigned char mag, const RegNumList &editions) const
{
	RegNumList::ConstIterator it;

	if (mag & REG_MAGAZINE_CT) {
		for (it = ct_editions.begin(); it != ct_editions.end(); ++it)
			if (editions.contains(*it))
				return true;
	}
	if (mag & REG_MAGAZINE_IX) {
		for (it = ix_editions.begin(); it != ix_editions.end(); ++it)
			if (editions.contains(*it))
				return true;
	}

	return false;
}
