% Conference paper from IRI 2003 (conference dates 27--29 Oct 2003, per the note field).
% The proceedings title is recorded once, in booktitle; it is intentionally not
% repeated in series, which would make standard styles print it twice.
@inproceedings{25a8f40b9fc64cc7ab3a822ff620c2de,
  author    = {Kaza, Siddharth and Murthy, S. N. Jayaram and Hu, Gongzhu},
  title     = {Identification of deliberately doctored text documents using {Frequent} {Keyword} {Chain} ({FKC}) model},
  booktitle = {Proceedings of the 2003 IEEE International Conference on Information Reuse and Integration, IRI 2003},
  editor    = {Smari, Waleed W. and Memon, Atif M.},
  pages     = {398--405},
  publisher = {Institute of Electrical and Electronics Engineers Inc.},
  year      = {2003},
  doi       = {10.1109/IRI.2003.1251443},
  language  = {English},
  keywords  = {Doctored document detection, Frequent keywords, Text document classification},
  note      = {Publisher Copyright: {\textcopyright} 2003 IEEE.; IEEE International Conference on Information Reuse and Integration, IRI 2003 ; Conference date: 27-10-2003 Through 29-10-2003},
  abstract  = {Text documents have always been the most dominant source of data available. A number of classification techniques are used to organize these documents and a majority of these classification algorithms use keywords to categorize them. It is possible to mislead such algorithms by inserting keywords ('deliberate doctoring') belonging to a class different from that of the document. Such intentional deception is done in order to rank web pages higher in searches. As text classification is used to classify e-mails, deliberate doctoring is also done as a spam filter-busting measure. In addition it may be practiced to avoid detection by security agencies. The cost of such misclassification can be high and it is a serious problem in many scenarios. In this paper we have exhaustively examined the possible methods to doctor a document which may lead to its misclassification. In the study we have concluded that a majority of the ways would involve insertion of a number of misleading keywords in close proximity. We propose the Frequent Keyword Chain model to identify such local concentration of keywords. A tool called the FKCLocater is designed around the model which identifies and highlights FKC's in a document and alerts the user to the possibility of misclassification. The tool is also used to specify various parameters to fine tune the Frequency Keyword Chain model. Experiments on Newsgroup data sets show that this model is effective.},
}