- "Coupled Semi-Supervised Learning," A. Carlson, PhD Thesis, 2010.
[bib]
[supplementary online materials]
@phdthesis{carlson2010,
Title = {Coupled Semi-Supervised Learning},
Author = {Andrew Carlson},
School = {Carnegie Mellon University},
Year = {2010}}
- "Toward an Architecture for Never-Ending Language Learning," A. Carlson, J. Betteridge, B. Kisiel, B. Settles, E. R. Hruschka Jr. and T. M. Mitchell,
To appear in the Proceedings of the Twenty-Fourth AAAI Conference on Artificial Intelligence (AAAI 2010), 2010.
[abstract]
[bib]
[supplementary online materials]
We consider here the problem of building a never-ending language learner; that is, an intelligent computer agent that runs forever and that each day must (1) extract, or read, information from the web to populate a growing structured knowledge base, and (2) learn to perform this task better than on the previous day. In particular, we propose an approach and a set of design principles for such an agent, describe a partial implementation of such a system that has already learned to extract a knowledge base containing over 242,000 beliefs with an estimated precision of 74%, and discuss lessons learned from this preliminary attempt to build a never-ending learning agent.
@inproceedings{carlson-aaai,
Title = {Toward an Architecture for Never-Ending Language Learning},
Author = {Andrew Carlson and Justin Betteridge and Bryan Kisiel and Burr Settles and Estevam R. Hruschka Jr. and Tom M. Mitchell},
Booktitle = {Proceedings of the Twenty-Fourth AAAI Conference on Artificial Intelligence (AAAI 2010)},
Year = {2010}}
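The architecture described in the abstract can be summarized, very roughly, as a daily loop over extraction, promotion, and retraining. The sketch below is only an illustrative outline of that loop, not the actual system; every class and method name here is a hypothetical placeholder.
def never_ending_learning_loop(extractors, knowledge_base, corpus, days, threshold=0.9):
    # One iteration corresponds to one "day" of reading and learning.
    for day in range(days):
        candidates = []
        for extractor in extractors:
            # Each extractor proposes (belief, confidence) pairs from the corpus,
            # conditioned on what the knowledge base already contains.
            candidates.extend(extractor.extract(corpus, knowledge_base))
        # Promote only confident beliefs that do not contradict existing knowledge.
        promoted = [belief for belief, confidence in candidates
                    if confidence >= threshold and knowledge_base.is_consistent(belief)]
        knowledge_base.add(promoted)
        # Self-supervision: promoted beliefs become new training examples, so the
        # extractors should read better on the next day.
        for extractor in extractors:
            extractor.retrain(knowledge_base)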
- "Coupled Semi-Supervised Learning for Information Extraction," A. Carlson, J. Betteridge, R. C. Wang, E. R. Hruschka Jr. and T. M. Mitchell,
Proceedings of the Third ACM International Conference on
Web Search and Data Mining (WSDM), 2010.
[abstract]
[bib]
[supplementary online materials]
[video of the talk]
We consider the problem of semi-supervised learning to extract categories (e.g., academicFields, athletes) and relations (e.g., PlaysSport(athlete, sport)) from web pages, starting with a handful of labeled training examples of each category or relation, plus hundreds of millions of unlabeled web documents. Semi-supervised training using only a few labeled examples is typically unreliable because the learning task is underconstrained. This paper pursues the thesis that much greater accuracy can be achieved by further constraining the learning task, by coupling the semi-supervised training of many extractors for different categories and relations. We characterize several ways in which the training of category and relation extractors can be coupled, and present experimental results demonstrating significantly improved accuracy as a result.
@inproceedings{carlson-wsdm,
Title = {Coupled Semi-Supervised Learning for Information Extraction},
Author = {Andrew Carlson and Justin Betteridge and Richard C. Wang and Estevam R. Hruschka Jr. and Tom M. Mitchell},
Booktitle = {Proceedings of the Third ACM International Conference on Web Search and Data Mining (WSDM 2010)},
Year = {2010}}
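One concrete form of the coupling described in the abstract above is a mutual-exclusion constraint between categories. The following is a minimal, self-contained sketch of that idea (an assumed simplification, not the paper's algorithm): a candidate noun phrase is promoted for a category only if no mutually exclusive category also claims it with high confidence.
def coupled_bootstrap_step(scores, mutex, threshold=0.8):
    """scores: {category: {noun_phrase: confidence}};
    mutex: {category: set of mutually exclusive categories}.
    Returns the noun phrases promoted for each category this iteration."""
    promoted = {category: set() for category in scores}
    for category, candidates in scores.items():
        for phrase, confidence in candidates.items():
            if confidence < threshold:
                continue
            # Coupling: reject the candidate if any mutually exclusive category
            # also assigns it a high confidence.
            conflict = any(scores.get(other, {}).get(phrase, 0.0) >= threshold
                           for other in mutex.get(category, ()))
            if not conflict:
                promoted[category].add(phrase)
    return promoted

# Toy example: "Texas" is claimed confidently by both 'city' and 'state',
# so neither category promotes it; "Pittsburgh" is promoted for 'city'.
scores = {'city': {'Pittsburgh': 0.95, 'Texas': 0.85}, 'state': {'Texas': 0.9}}
mutex = {'city': {'state'}, 'state': {'city'}}
print(coupled_bootstrap_step(scores, mutex))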
- "Populating the Semantic Web by Macro-Reading Internet Text," T. M. Mitchell, J. Betteridge, A. Carlson, E. R. Hruschka Jr., and R. C. Wang, Invited paper, Proceedings of the 8th International Semantic Web Conference (ISWC 2009), 2009.
[abstract]
[bib]
A key question regarding the future of the semantic web is "how will we acquire structured information to populate the semantic web on a vast scale?" One approach is to enter this information manually. A second approach is to take advantage of pre-existing databases, and to develop common ontologies, publishing standards, and reward systems to make this data widely accessible. We consider here a third approach: developing software that automatically extracts structured information from unstructured text present on the web. We also describe preliminary results demonstrating that machine learning algorithms can learn to extract tens of thousands of facts to populate a diverse ontology, with imperfect but reasonably good accuracy.
@inproceedings{mitchell-iswc09,
Title = {Populating the Semantic Web by Macro-Reading Internet Text},
Author = {Tom M. Mitchell and Justin Betteridge and Andrew Carlson and Estevam R. Hruschka Jr. and Richard C. Wang},
Booktitle = {Proceedings of the 8th International Semantic Web Conference (ISWC 2009)},
Year = {2009}}
- "Coupling Semi-Supervised Learning of Categories and Relations," A. Carlson, J. Betteridge, E. R. Hruschka Jr. and T. M. Mitchell,
Proceedings of the NAACL HLT 2009 Workshop on Semi-supervised Learning for Natural Language Processing, 2009.
[abstract]
[bib]
We consider semi-supervised learning of information extraction methods, especially for extracting instances of noun categories (e.g., athlete, team) and relations (e.g., playsForTeam(athlete,team)). Semisupervised approaches using a small number of labeled examples together with many unlabeled examples are often unreliable as they frequently produce an internally consistent, but nevertheless incorrect set of extractions. We propose that this problem can be overcome by simultaneously learning classifiers for many different categories and relations in the presence of an ontology defining constraints that couple the training of these classifiers. Experimental results show that simultaneously learning a coupled collection of classifiers for 30 categories and relations results in much more accurate extractions than training classifiers individually.
@inproceedings{carlson-sslnlp09,
Title = {Coupling Semi-Supervised Learning of Categories and Relations},
Author = {Andrew Carlson and Justin Betteridge and Estevam R. Hruschka Jr. and Tom M. Mitchell},
Booktitle = {Proceedings of the NAACL HLT 2009 Workshop on Semi-supervised Learning for Natural Language Processing},
Year = {2009}}
- "Toward Never Ending Language Learning," J. Betteridge, A. Carlson, S. A. Hong, E. R. Hruschka Jr., E. L. M. Law, T. M. Mitchell, S. H. Wang,
Proceedings of the 2009 AAAI Spring Symposium on Learning by Reading and Learning to Read, 2009.
[abstract]
[bib]
We report research toward a never-ending language learning system, focusing on a first implementation which learns to classify occurrences of noun phrases according to lexical categories such as city and university. Our experiments suggest that the accuracy of classifiers produced by semi-supervised learning can be improved by coupling the learning of multiple classes based on background knowledge about relationships between the classes (e.g., university is mutually exclusive of company, and is a subset of organization).
@inproceedings{mitchell-lbr09a,
Title = {Toward Never Ending Language Learning},
Author = {Justin Betteridge and Andrew Carlson and Sue A. Hong and Estevam R. Hruschka Jr. and Edith L. M. Law and Tom M. Mitchell and Sophie H. Wang},
Booktitle = {Proceedings of the 2009 AAAI Spring Symposium on Learning by Reading and Learning to Read},
Year = {2009}}
- "Learning a Named Entity Tagger from Gazetteers with the Partial Perceptron", A. Carlson, S. Gaffney, and F. Vasile, Proceedings of the 2009 AAAI Spring Symposium on Learning by Reading and Learning to Read, 2009.
[abstract]
[bib]
While gazetteers can be used to perform named entity recognition through lookup-based methods, ambiguity and incomplete gazetteers lead to relatively low recall. A sequence model which uses more general features can achieve higher recall while maintaining reasonable precision, but typically requires expensive annotated training data. To circumvent the need for such training data, we bootstrap the learning of a sequence model with a gazetteer-driven labeling algorithm which only labels tokens in unlabeled data that it can label confidently. We present an algorithm, called the Partial Perceptron, for discriminatively learning the parameters of a sequence model from such partially labeled data. The algorithm is easy to implement and trains much more quickly than a state-of-the-art algorithm based on Conditional Random Fields with equivalent performance. Experimental results show that the learned model yields a substantial relative improvement in recall (77.3%) with some loss in precision (a 28.7% relative decrease) when compared to the gazetteer-driven method.
@inproceedings{carlson-lbr09,
Title = {Learning a Named Entity Tagger from Gazetteers with the Partial Perceptron},
Author = {Andrew Carlson and Scott Gaffney and Flavian Vasile},
Booktitle = {Proceedings of the 2009 AAAI Spring Symposium on Learning by Reading and Learning to Read},
Year = {2009}}
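To make the training regime concrete, here is a heavily simplified, token-level sketch of the partial-labeling idea described in the abstract above. The actual paper trains a full sequence model with Viterbi inference; this illustration drops the sequence structure and simply skips perceptron updates at tokens the gazetteer-driven labeler left unlabeled. All interfaces are assumptions for illustration.
from collections import defaultdict

def partial_perceptron_train(sentences, labels, epochs=5):
    """sentences: list of per-token feature lists; labels: parallel lists whose
    entries are a tag string or None for tokens left unlabeled."""
    weights = defaultdict(float)  # (feature, tag) -> weight
    tags = {t for seq in labels for t in seq if t is not None}
    def score(feats, tag):
        return sum(weights[(f, tag)] for f in feats)
    for _ in range(epochs):
        for feats_seq, tag_seq in zip(sentences, labels):
            for feats, gold in zip(feats_seq, tag_seq):
                if gold is None:          # no confident partial label: no update
                    continue
                pred = max(tags, key=lambda t: score(feats, t))
                if pred != gold:          # standard perceptron update on labeled tokens
                    for f in feats:
                        weights[(f, gold)] += 1.0
                        weights[(f, pred)] -= 1.0
    return weights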
- "Bootstrapping Information Extraction from Semi-structured Web Pages," A. Carlson and C. Schafer, Proceedings of the European Conference on Machine Learning and Principles and Practice of Knowledge Discovery in Databases (ECML/PKDD), 2008.
[abstract]
[bib]
We consider the problem of extracting structured records from semi-structured web pages with no human supervision required for each target web site. Previous work on this problem has either required significant human effort for each target site or used brittle heuristics to identify semantic data types. Our method only requires annotation for a few pages from a few sites in the target domain. Thus, after a tiny investment of human effort, our method allows automatic extraction from potentially thousands of other sites within the same domain. Our approach extends previous methods for detecting data fields in semi-structured web pages by matching those fields to domain schema columns using robust models of data values and contexts. Annotating 2-5 pages for 4-6 web sites yields an extraction accuracy of 83.8% on job offer sites and 91.1% on vacation rental sites. These results significantly outperform a baseline approach.
@inproceedings{carlson-ecml08,
Title = {Bootstrapping Information Extraction from Semi-structured Web Pages},
Author = {Andrew Carlson and Charles Schafer},
Booktitle = {Proceedings of the European Conference on Machine Learning and Principles and Practice of Knowledge Discovery in Databases (ECML/PKDD 2008)},
Year = {2008}}
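The field-to-schema matching step described in the abstract above can be pictured as scoring each detected page field against every schema column and keeping the best match. The sketch below is a hedged illustration only; value_score and context_score are hypothetical stand-ins for the paper's learned models of data values and contexts.
def match_fields_to_schema(fields, columns, value_score, context_score, min_score=0.5):
    """fields: list of (field_id, values, contexts) detected on a new site;
    columns: list of domain schema column names;
    value_score / context_score: callables returning a score in [0, 1]."""
    assignment = {}
    for field_id, values, contexts in fields:
        best_column, best = None, min_score
        for column in columns:
            # Combine how well the field's values and its surrounding contexts
            # match the models learned for this schema column.
            score = 0.5 * value_score(column, values) + 0.5 * context_score(column, contexts)
            if score > best:
                best_column, best = column, score
        if best_column is not None:
            assignment[field_id] = best_column
    return assignment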
- "Predicting Human Brain Activity Associated with the Meanings of Nouns," T. M. Mitchell, S. V. Shinkareva, A. Carlson, K.M. Chang, V. L. Malave, R. A. Mason, and M. A. Just, Science 320, May 30, 2008, 1161-1165. [supporting online material] [supporting website]
[abstract]
[bib]
The question of how the human brain represents conceptual knowledge has been debated in many scientific fields. Brain imaging studies have shown that different spatial patterns of neural activation are associated with thinking about different semantic categories of pictures and words (for example, tools, buildings, and animals). We present a computational model that predicts the functional magnetic resonance imaging (fMRI) neural activation associated with words for which fMRI data are not yet available. This model is trained with a combination of data from a trillion-word text corpus and observed fMRI data associated with viewing several dozen concrete nouns. Once trained, the model predicts fMRI activation for thousands of other concrete nouns in the text corpus, with highly significant accuracies over the 60 nouns for which we currently have fMRI data.
@article{mitchell-science2008,
author = {Mitchell, Tom M. and Shinkareva, Svetlana V. and Carlson, Andrew and Chang, Kai-Min and Malave, Vicente L. and Mason, Robert A. and Just, Marcel A. },
journal = {Science},
month = {May},
number = {5880},
pages = {1191--1195},
title = {Predicting Human Brain Activity Associated with the Meanings of Nouns},
volume = {320},
year = {2008}}
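The kind of model the abstract describes can be sketched as a learned linear mapping from corpus-derived semantic features of a noun to its per-voxel fMRI activation. The code below uses ordinary ridge regression as an illustrative stand-in; the feature construction and regularization are assumptions, not the paper's exact setup.
import numpy as np

def fit_voxel_model(features, activations, alpha=1.0):
    """features: (n_nouns, n_features) corpus statistics for the training nouns;
    activations: (n_nouns, n_voxels) observed fMRI responses for those nouns.
    Returns a weight matrix mapping features to predicted voxel activations."""
    n_features = features.shape[1]
    # Closed-form ridge solution: W = (X^T X + alpha * I)^{-1} X^T Y
    gram = features.T @ features + alpha * np.eye(n_features)
    return np.linalg.solve(gram, features.T @ activations)

def predict_activation(weights, noun_features):
    """Predict the voxel activation pattern for a noun not seen in training."""
    return noun_features @ weights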
- "On the Chance Accuracies of Large Collections of Classifiers," M. Palatucci and A. Carlson, Proceedings of the 25th International Conference on Machine Learning (ICML), 2008.
[abstract]
[bib]
We provide a theoretical analysis of the chance accuracies of large collections of classifiers. We show that on problems with small numbers of examples, some classifier can perform well by random chance, and we derive a theorem to explicitly calculate this accuracy.
We use this theorem to provide a principled feature selection criterion for sparse, high-dimensional problems. We evaluate this method on microarray and fMRI datasets and show that it performs very close to the optimal accuracy obtained from an oracle. We also show that on the fMRI dataset this technique chooses relevant features successfully while another state-of-the-art method, the False Discovery Rate (FDR), completely fails at standard significance levels.
@inproceedings{palatucci-icml08,
author = {Mark Palatucci and Andrew Carlson},
title = {On the Chance Accuracies of Large Collections of Classifiers},
booktitle = {Proceedings of the 25th International Conference on Machine Learning},
month = {July},
year = {2008}}
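A small illustration of the point made in the abstract above, under an assumed formulation rather than the paper's exact theorem: if n independent classifiers guess on m examples at chance level p, the best observed accuracy reaches k/m with probability 1 - F(k-1; m, p)^n, where F is the binomial CDF. With many classifiers and few examples, that probability is close to 1.
from scipy.stats import binom

def prob_best_chance_accuracy(n_classifiers, n_examples, k_correct, p_chance=0.5):
    """Probability that at least one of n chance-level classifiers gets >= k examples right."""
    prob_single_below = binom.cdf(k_correct - 1, n_examples, p_chance)
    return 1.0 - prob_single_below ** n_classifiers

# With 10,000 chance classifiers and only 20 test examples, some classifier
# almost surely reaches 80% accuracy (16 of 20 correct) by luck alone.
print(prob_best_chance_accuracy(10_000, 20, 16))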
- "Memory-Based Context-Sensitive Spelling Correction at Web Scale," A. Carlson and I. Fette, Proceedings of the IEEE International Conference on Machine Learning and Applications (ICMLA), 2007.
[abstract]
[bib]
We study the problem of correcting spelling mistakes in text using memory-based learning techniques and a very large database of token n-gram occurrences in web text as training data. Our approach uses the context in which an error appears to select the most likely candidate from words which might have been intended in its place. Using a novel correction algorithm and a massive database of training data, we demonstrate higher accuracy on correcting real-word errors than previous work, and very high accuracy at a new task of ranking corrections to non-word errors given by a standard spelling correction package.
@inproceedings{carlson-icmla07,
author = {Andrew Carlson and Ian Fette},
title = {Memory-Based Context-Sensitive Spelling Correction at Web Scale},
booktitle = {Proceedings of the IEEE International Conference on Machine Learning and Applications (ICMLA)},
year = {2007}}
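The core selection step described in the abstract above can be sketched as follows, under assumed interfaces: score each candidate word from a confusion set by how often the surrounding token n-gram occurs in a large web n-gram database, and keep the most frequent candidate. Here ngram_count is a hypothetical lookup into such a database.
def correct_token(tokens, position, confusion_set, ngram_count, window=2):
    """Return the candidate whose substitution yields the most frequent context n-gram."""
    left = tokens[max(0, position - window):position]
    right = tokens[position + 1:position + 1 + window]
    best_word, best_count = tokens[position], -1
    for candidate in confusion_set:
        count = ngram_count(tuple(left + [candidate] + right))
        if count > best_count:
            best_word, best_count = candidate, count
    return best_word

# Usage with a toy count table standing in for the web-scale n-gram database:
counts = {('going', 'to', 'the', 'store'): 5000, ('going', 'too', 'the', 'store'): 3}
tokens = ['going', 'too', 'the', 'store']
print(correct_token(tokens, 1, {'to', 'too', 'two'}, lambda g: counts.get(g, 0)))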
- "Scaling Up Context Sensitive Text Correction," A. Carlson, J. Rosen, and D. Roth, Proceedings of the The Thirteenth Conference on Innovative Applications of Artificial Intelligence (IAAI), 2001.
[abstract]
[bib]
The main challenge in an effort to build a realistic system with context-sensitive inference capabilities, beyond accuracy, is scalability. This paper studies this problem in the context of a learning-based approach to context-sensitive text correction: the task of fixing spelling errors that result in valid words, such as substituting to for too, casual for causal, and so on. Research papers on this problem have developed algorithms that can achieve fairly high accuracy, in many cases over 90%. However, this level of performance is not sufficient for a large-coverage practical system since it implies a low sentence-level performance.
We examine and offer solutions to several issues relating to scaling up a context-sensitive text correction system. In particular, we suggest methods to reduce the memory requirements while maintaining a high level of performance, and show that this can still allow the system to adapt to new domains. Most important, we show how to significantly increase the coverage of the system to realistic levels, while providing a very high level of performance, at the 99% level.
@inproceedings{carlson-iaai01,
author = {Andrew Carlson and Jeffrey Rosen and Dan Roth},
title = {Scaling Up Context-Sensitive Text Correction},
booktitle = {Proceedings of the The Thirteenth Conference on Innovative Applications of Artificial Intelligence (IAAI 2001)},
year = {2001}}
- "The SNoW Learning Architecture," A. Carlson, C. Cumby, J. Rosen, and D. Roth (1999). Technical Report UIUCDCS-R-99-2101, UIUC Computer Science Department, May, 1999. (latest version)