.\" Automatically generated by Pod::Man 4.11 (Pod::Simple 3.35)
.\"
.\" Standard preamble:
.\" ========================================================================
.de Sp \" Vertical space (when we can't use .PP)
.if t .sp .5v
.if n .sp
..
.de Vb \" Begin verbatim text
.ft CW
.nf
.ne \\$1
..
.de Ve \" End verbatim text
.ft R
.fi
..
.\" Set up some character translations and predefined strings.  \*(-- will
.\" give an unbreakable dash, \*(PI will give pi, \*(L" will give a left
.\" double quote, and \*(R" will give a right double quote.  \*(C+ will
.\" give a nicer C++.  Capital omega is used to do unbreakable dashes and
.\" therefore won't be available.  \*(C` and \*(C' expand to `' in nroff,
.\" nothing in troff, for use with C<>.
.tr \(*W-
.ds C+ C\v'-.1v'\h'-1p'\s-2+\h'-1p'+\s0\v'.1v'\h'-1p'
.ie n \{\
.    ds -- \(*W-
.    ds PI pi
.    if (\n(.H=4u)&(1m=24u) .ds -- \(*W\h'-12u'\(*W\h'-12u'-\" diablo 10 pitch
.    if (\n(.H=4u)&(1m=20u) .ds -- \(*W\h'-12u'\(*W\h'-8u'-\"  diablo 12 pitch
.    ds L" ""
.    ds R" ""
.    ds C` ""
.    ds C' ""
'br\}
.el\{\
.    ds -- \|\(em\|
.    ds PI \(*p
.    ds L" ``
.    ds R" ''
.    ds C`
.    ds C'
'br\}
.\"
.\" Escape single quotes in literal strings from groff's Unicode transform.
.ie \n(.g .ds Aq \(aq
.el       .ds Aq '
.\"
.\" If the F register is >0, we'll generate index entries on stderr for
.\" titles (.TH), headers (.SH), subsections (.SS), items (.Ip), and index
.\" entries marked with X<> in POD.  Of course, you'll have to process the
.\" output yourself in some meaningful fashion.
.\"
.\" Avoid warning from groff about undefined register 'F'.
.de IX
..
.nr rF 0
.if \n(.g .if rF .nr rF 1
.if (\n(rF:(\n(.g==0)) \{\
.    if \nF \{\
.        de IX
.        tm Index:\\$1\t\\n%\t"\\$2"
..
.        if !\nF==2 \{\
.            nr % 0
.            nr F 2
.        \}
.    \}
.\}
.rr rF
.\"
.\" Accent mark definitions (@(#)ms.acc 1.5 88/02/08 SMI; from UCB 4.2).
.\" Fear.  Run.  Save yourself.  No user-serviceable parts.
.    \" fudge factors for nroff and troff
.if n \{\
.    ds #H 0
.    ds #V .8m
.    ds #F .3m
.    ds #[ \f1
.    ds #] \fP
.\}
.if t \{\
.    ds #H ((1u-(\\\\n(.fu%2u))*.13m)
.    ds #V .6m
.    ds #F 0
.    ds #[ \&
.    ds #] \&
.\}
.    \" simple accents for nroff and troff
.if n \{\
.    ds ' \&
.    ds ` \&
.    ds ^ \&
.    ds , \&
.    ds ~ ~
.    ds /
.\}
.if t \{\
.    ds ' \\k:\h'-(\\n(.wu*8/10-\*(#H)'\'\h"|\\n:u"
.    ds ` \\k:\h'-(\\n(.wu*8/10-\*(#H)'\`\h'|\\n:u'
.    ds ^ \\k:\h'-(\\n(.wu*10/11-\*(#H)'^\h'|\\n:u'
.    ds , \\k:\h'-(\\n(.wu*8/10)',\h'|\\n:u'
.    ds ~ \\k:\h'-(\\n(.wu-\*(#H-.1m)'~\h'|\\n:u'
.    ds / \\k:\h'-(\\n(.wu*8/10-\*(#H)'\z\(sl\h'|\\n:u'
.\}
.    \" troff and (daisy-wheel) nroff accents
.ds : \\k:\h'-(\\n(.wu*8/10-\*(#H+.1m+\*(#F)'\v'-\*(#V'\z.\h'.2m+\*(#F'.\h'|\\n:u'\v'\*(#V'
.ds 8 \h'\*(#H'\(*b\h'-\*(#H'
.ds o \\k:\h'-(\\n(.wu+\w'\(de'u-\*(#H)/2u'\v'-.3n'\*(#[\z\(de\v'.3n'\h'|\\n:u'\*(#]
.ds d- \h'\*(#H'\(pd\h'-\w'~'u'\v'-.25m'\f2\(hy\fP\v'.25m'\h'-\*(#H'
.ds D- D\\k:\h'-\w'D'u'\v'-.11m'\z\(hy\v'.11m'\h'|\\n:u'
.ds th \*(#[\v'.3m'\s+1I\s-1\v'-.3m'\h'-(\w'I'u*2/3)'\s-1o\s+1\*(#]
.ds Th \*(#[\s+2I\s-2\h'-\w'I'u*3/5'\v'-.3m'o\v'.3m'\*(#]
.ds ae a\h'-(\w'a'u*4/10)'e
.ds Ae A\h'-(\w'A'u*4/10)'E
.    \" corrections for vroff
.if v .ds ~ \\k:\h'-(\\n(.wu*9/10-\*(#H)'\s-2\u~\d\s+2\h'|\\n:u'
.if v .ds ^ \\k:\h'-(\\n(.wu*10/11-\*(#H)'\v'-.4m'^\v'.4m'\h'|\\n:u'
.    \" for low resolution devices (crt and lpr)
.if \n(.H>23 .if \n(.V>19 \
\{\
.    ds : e
.    ds 8 ss
.    ds o a
.    ds d- d\h'-1'\(ga
.    ds D- D\h'-1'\(hy
.    ds th \o'bp'
.    ds Th \o'LP'
.    ds ae ae
.    ds Ae AE
.\}
.rm #[ #] #H #V #F C
.\" ========================================================================
.\"
.IX Title "LSM 1"
.TH LSM 1 "2024-05-10" "1.0" "Latent Semantic Mapping"
.\" For nroff, turn off justification.  Always turn off hyphenation; it makes
.\" way too many mistakes in technical documents.
.if n .ad l
.nh
.SH "NAME"
lsm \- Latent Semantic Mapping tool
.SH "SYNOPSIS"
.IX Header "SYNOPSIS"
lsm \fIlsm_command\fR [\fIcommand_options\fR] \fImap_file\fR [\fIinput_files\fR]
.SH "DESCRIPTION"
.IX Header "DESCRIPTION"
The Latent Semantic Mapping framework is a language independent,
Unicode based technology that builds \fImaps\fR and uses them to classify
\&\fItexts\fR into one of a number of \fIcategories\fR.
.PP
\&\fBlsm\fR is a tool to create, manipulate, test, and dump Latent Semantic
Mapping maps. It is designed to provide access to a large subset of
the functionality of the Latent Semantic Mapping \s-1API,\s0 mainly for rapid
prototyping and diagnostic purposes, but possibly also for simple
shell script based applications of Latent Semantic Mapping.
.SH "COMMANDS"
.IX Header "COMMANDS"
\&\fBlsm\fR provides a variety of commands (\fIlsm_command\fR in the
Synopsis), each of which often has a wealth of options (see the
Command Options below). Command names may be abbreviated to
unambiguous prefixes.
.IP "\fBlsm create\fR \fImap_file\fR \fIinput_files\fR" 4
.IX Item "lsm create map_file input_files"
Create a new \s-1LSM\s0 map from the specified \fIinput_files\fR.
.IP "\fBlsm update\fR \fImap_file\fR \fIinput_files\fR" 4
.IX Item "lsm update map_file input_files"
Add the specified \fIinput_files\fR to an existing \s-1LSM\s0 map.
.IP "\fBlsm evaluate\fR \fImap_file\fR \fIinput_files\fR" 4
.IX Item "lsm evaluate map_file input_files"
Classify the specified \fIinput_files\fR into the categories of the \s-1LSM\s0 map.
.IP "\fBlsm cluster\fR \fB[\-\-k\-means=N | \-\-agglomerative=N] [\-\-apply]\fR \fImap_file\fR" 4
.IX Item "lsm cluster [--k-means=N | --agglomerative=N] [--apply] map_file"
Compute clusters for the map, and, if the \fB\-\-apply\fR option is specified, 
transform the map accordingly. Multiple levels of clustering may be applied for
faster performance on large maps, e.g.
.Sp
.Vb 1
\&   lsm cluster \-\-k\-means=100 \-\-each \-\-agglomerative=100 \-\-agglomerative=1000 my.map
.Ve
.Sp
first computes 100 clusters using (fast) k\-means clustering, computes 100 subclusters
for each first-stage cluster using agglomerative clustering, and finally reduces those
10000 clusters to 1000 using agglomerative clustering.
.IP "\fBlsm dump\fR \fImap_file\fR [\fIinput_files\fR]" 4
.IX Item "lsm dump map_file [input_files]"
Without \fIinput_files\fR, dump all words in the map with their
counts. With \fIinput_files\fR, dump, for each file, the words that
appear in the map, their counts in the map, and their relative
frequencies in the input file.
.IP "\fBlsm info\fR \fImap_file\fR" 4
.IX Item "lsm info map_file"
Bypass the Latent Semantic Mapping framework to extract and print
information about the file and perform a number of consistency checks
on it. \fB(\s-1NOT IMPLEMENTED YET\s0)\fR
.SH "COMMAND OPTIONS"
.IX Header "COMMAND OPTIONS"
This section describes the \fIcommand_options\fR that are available for
the \fBlsm\fR commands. Not all commands support all of these options;
each option is only supported for commands where it makes sense.
However, when a command supports one of these options, the option has
the same meaning as it does in other commands.
.IP "\fB\-\-append\-categories\fR" 4
.IX Item "--append-categories"
Directs the \fBupdate\fR command to put the data into new categories appended 
after the existing ones, instead of adding the data to the existing categories.
.IP "\fB\-\-categories\fR \fIcount\fR" 4
.IX Item "--categories count"
Directs the \fBevaluate\fR command to only list the top \fIcount\fR categories.
.IP "\fB\-\-category\-delimiter\fR \fIdelimiter\fR" 4
.IX Item "--category-delimiter delimiter"
Specify the delimiter to be used between categories in the
\&\fIinput_files\fR passed to the \fBcreate\fR and \fBupdate\fR commands.
.RS 4
.IP "\fBgroup\fR" 8
.IX Item "group"
Categories are separated by a `;' argument.
.IP "\fBfile\fR" 8
.IX Item "file"
Each \fIinput_file\fR represents a separate category. This is the default
if the \fB\-\-category\-delimiter\fR option is not given.
.IP "\fBline\fR" 8
.IX Item "line"
Each line represents a separate category.
.IP "\fIstring\fR" 8
.IX Item "string"
Categories are separated by the specified \fIstring\fR.
.RE
.RS 4
.RE
.IP "\fB\-\-clobber\fR" 4
.IX Item "--clobber"
When creating a map, overwrite an existing file at the path, even if it's not an \s-1LSM\s0 map.
By default, \fBcreate\fR will only overwrite an existing file if it's believed to be an \s-1LSM\s0 
map, which guards against frequent operator errors such as:
.Sp
.Vb 1
\&   lsm create /usr/include/*.h
.Ve
.IP "\fB\-\-dimensions\fR \fIdim\fR" 4
.IX Item "--dimensions dim"
Direct the \fBcreate\fR and \fBupdate\fR commands to use the given number of
dimensions for computing the map (defaults to the number of
categories). This option is useful to manage the size and
computational overhead of maps with a large number of categories.
.IP "\fB\-\-discard\-counts\fR" 4
.IX Item "--discard-counts"
Direct the \fBcreate\fR and \fBupdate\fR commands to omit the raw word / token 
counts when writing the map. This results in a map that is more compact, 
but cannot be updated any further.
.IP "\fB\-\-hash\fR" 4
.IX Item "--hash"
Direct the \fBcreate\fR and \fBupdate\fR commands to write the map in a
format that is not human readable with default file manipulation tools
like \fBcat\fR or \fBhexdump\fR. This is useful in applications such as junk
mail filtering, where input data may contain naughty words and where
the contents of the map may tip off spammers as to which words to avoid.
.IP "\fB\-\-help\fR" 4
.IX Item "--help"
List an overview of the options available for a command. Available for
all commands.
.IP "\fB\-\-html\fR" 4
.IX Item "--html"
Strip \s-1HTML\s0 codes from the \fIinput_files\fR. Useful for mail and web
input. Available for the \fBcreate\fR, \fBupdate\fR, \fBevaluate\fR, and
\&\fBdump\fR commands.
.IP "\fB\-\-junk\-mail\fR" 4
.IX Item "--junk-mail"
When parsing the input files, apply heuristics to counteract common
methods used by spammers to disguise incriminating words such as:
.Sp
.Vb 3
\&   Zer0 1nt3rest l0ans     Substituting letters with digits
\&   W E A L T H             Adding spaces between letters
\&   m.o.r.t.g.a.g.e         Adding punctuation between letters
.Ve
.Sp
Available for the \fBcreate\fR, \fBupdate\fR, \fBevaluate\fR, and
\&\fBdump\fR commands.
.IP "\fB\-\-pairs\fR" 4
.IX Item "--pairs"
If specified with the \fBcreate\fR command when building the map, store
counts for pairs of words as well as the words themselves. This can
increase accuracy for certain classes of problems, but will generate
unreasonably large maps unless the vocabulary is fairly limited.
.IP "\fB\-\-stop\-words\fR \fIstop_word_file\fR" 4
.IX Item "--stop-words stop_word_file"
If specified with the \fBcreate\fR command, \fIstop_word_file\fR is parsed
and all words found are excluded from texts evaluated against the
map. This is useful for excluding frequent, semantically meaningless
words.
.IP "\fB\-\-sweep\-cutoff\fR \fIthreshold\fR" 4
.IX Item "--sweep-cutoff threshold"
.PD 0
.IP "\fB\-\-sweep\-frequency\fR \fIdays\fR" 4
.IX Item "--sweep-frequency days"
.PD
Available for the \fBcreate\fR and \fBupdate\fR commands. Every specified
number of \fIdays\fR (by default 7), scan the map and remove from it any
entries that have been in the map for at least 2 previous scans and
whose total counts are smaller than \fIthreshold\fR.  \fIthreshold\fR
defaults to 0, so by default the map is not scanned.
.IP "\fB\-\-text\-delimiter\fR \fIdelimiter\fR" 4
.IX Item "--text-delimiter delimiter"
Specify the delimiter to be used between texts in the
\&\fIinput_files\fR passed to the \fBcreate\fR, \fBupdate\fR, \fBevaluate\fR, and
\&\fBdump\fR commands.
.RS 4
.IP "\fBfile\fR" 8
.IX Item "file"
Each \fIinput_file\fR represents a separate text. This is the default
if the \fB\-\-text\-delimiter\fR option is not given.
.IP "\fBline\fR" 8
.IX Item "line"
Each line represents a separate text.
.IP "\fIstring\fR" 8
.IX Item "string"
Texts are separated by the specified \fIstring\fR.
.RE
.RS 4
.RE
.IP "\fB\-\-triplets\fR" 4
.IX Item "--triplets"
If specified with the \fBcreate\fR command when building the map, store
counts for triplets and pairs of words as well as the words
themselves. This can increase accuracy for certain classes of
problems, but will generate unreasonably large maps unless the
vocabulary is fairly limited.
.IP "\fB\-\-weight\fR \fIweight\fR" 4
.IX Item "--weight weight"
Scale counts of input words for the \fBcreate\fR and \fBupdate\fR commands
by the specified \fIweight\fR, which may be a positive or negative
floating point number.
.IP "\fB\-\-words\fR" 4
.IX Item "--words"
Directs the \fBevaluate\fR or \fBcluster\fR commands to apply to words, instead of categories.
.IP "\fB\-\-words\fR=\fIcount\fR" 4
.IX Item "--words=count"
Directs the \fBevaluate\fR command to list the top \fIcount\fR words, instead of categories.
.SH "EXAMPLES"
.IX Header "EXAMPLES"
.ie n .IP """lsm evaluate \-\-html \-\-junk\-mail ~/Library/Mail/V2/MailData/LSMMap2 msg*.txt""" 4
.el .IP "\f(CWlsm evaluate \-\-html \-\-junk\-mail ~/Library/Mail/V2/MailData/LSMMap2 msg*.txt\fR" 4
.IX Item "lsm evaluate --html --junk-mail ~/Library/Mail/V2/MailData/LSMMap2 msg*.txt"
Simulate the \fBMail.app\fR junk mail filter by evaluating the specified
files (assumed to each hold the raw text of one mail message) against
the user's junk mail map.
.ie n .IP """lsm dump ~/Library/Mail/V2/MailData/LSMMap2""" 4
.el .IP "\f(CWlsm dump ~/Library/Mail/V2/MailData/LSMMap2\fR" 4
.IX Item "lsm dump ~/Library/Mail/V2/MailData/LSMMap2"
Dump the words accumulated in the junk mail map and their counts.
.ie n .IP """lsm create \-\-category\-delimiter=group c_vs_h *.c \*(Aq;\*(Aq *.h""" 4
.el .IP "\f(CWlsm create \-\-category\-delimiter=group c_vs_h *.c \*(Aq;\*(Aq *.h\fR" 4
.IX Item "lsm create --category-delimiter=group c_vs_h *.c ; *.h"
Create an \s-1LSM\s0 map trained to distinguish C header files from C source
files.
.ie n .IP """lsm update \-\-weight 2.0 \-\-category\-delimiter=group c_vs_h \*(Aq;\*(Aq ../xy/*.h""" 4
.el .IP "\f(CWlsm update \-\-weight 2.0 \-\-category\-delimiter=group c_vs_h \*(Aq;\*(Aq ../xy/*.h\fR" 4
.IX Item "lsm update --weight 2.0 --category-delimiter=group c_vs_h ; ../xy/*.h"
Add some additional header files with an increased weight to the training.
.ie n .IP """lsm create \-\-help""" 4
.el .IP "\f(CWlsm create \-\-help\fR" 4
.IX Item "lsm create --help"
List the options available for the \fBlsm create\fR command.