tf-idf and parts of speech

Author: Andrew Heiss

Published: November 21, 2025

You had a really neat table showing the ten most unique words (i.e. highest tf-idf values) across the three sector-specific treatments.

I obviously don’t have your data, but I do have, um, Scripture Power™, so as an illustrative example I arbitrarily chose three books that are roughly the same length: Daniel, Romans, and Ether (all from the {scriptuRs} R package—you’ll probably need to install it if you don’t have it).
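
If you don’t already have {scriptuRs}, it should be installable the standard way (a quick sketch, assuming you’re installing from CRAN):

# Install {scriptuRs} if you don't already have it
install.packages("scriptuRs")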

Load data

library(tidyverse)
library(tidytext)
library(tinytable)
library(scriptuRs)
library(cleanNLP)

ether <- scriptuRs::book_of_mormon |>
  filter(book_title == "Ether")

daniel <- scriptuRs::old_testament |>
  filter(book_title == "Daniel")

romans <- scriptuRs::new_testament |>
  filter(book_title == "Romans")

all_three <- bind_rows(ether, daniel, romans) |>
  # Having a unique id number column will be helpful later for joining things
  mutate(id = row_number()) |>
  # Put these in order: OT > NT > BoM
  mutate(
    book_title = factor(
      book_title,
      levels = c("Daniel", "Romans", "Ether"),
      ordered = TRUE
    )
  ) |>
  # Making this a tibble isn't necessary, but it makes it print nicer in this document
  as_tibble()
all_three
## # A tibble: 1,223 × 20
##    volume_id book_id chapter_id verse_id volume_title   book_title volume_long_title  book_long_title   volume_subtitle                   book_subtitle volume_short_title book_short_title volume_lds_url book_lds_url chapter_number verse_number text                 verse_title verse_short_title    id
##        <dbl>   <dbl>      <dbl>    <dbl> <chr>          <ord>      <chr>              <chr>             <chr>                             <chr>         <chr>              <chr>            <chr>          <chr>                 <dbl>        <dbl> <chr>                <chr>       <chr>             <int>
##  1         3      80       1404    37111 Book of Mormon Ether      The Book of Mormon The Book of Ether Another Testament of Jesus Christ <NA>          BoM                Ether            bm             ether                     1            1 And now I, Moroni, … Ether 1:1   Ether 1:1             1
##  2         3      80       1404    37112 Book of Mormon Ether      The Book of Mormon The Book of Ether Another Testament of Jesus Christ <NA>          BoM                Ether            bm             ether                     1            2 And I take mine acc… Ether 1:2   Ether 1:2             2
##  3         3      80       1404    37113 Book of Mormon Ether      The Book of Mormon The Book of Ether Another Testament of Jesus Christ <NA>          BoM                Ether            bm             ether                     1            3 And as I suppose th… Ether 1:3   Ether 1:3             3
##  4         3      80       1404    37114 Book of Mormon Ether      The Book of Mormon The Book of Ether Another Testament of Jesus Christ <NA>          BoM                Ether            bm             ether                     1            4 Therefore I do not … Ether 1:4   Ether 1:4             4
##  5         3      80       1404    37115 Book of Mormon Ether      The Book of Mormon The Book of Ether Another Testament of Jesus Christ <NA>          BoM                Ether            bm             ether                     1            5 But behold, I give … Ether 1:5   Ether 1:5             5
##  6         3      80       1404    37116 Book of Mormon Ether      The Book of Mormon The Book of Ether Another Testament of Jesus Christ <NA>          BoM                Ether            bm             ether                     1            6 And on this wise do… Ether 1:6   Ether 1:6             6
##  7         3      80       1404    37117 Book of Mormon Ether      The Book of Mormon The Book of Ether Another Testament of Jesus Christ <NA>          BoM                Ether            bm             ether                     1            7 Coriantor was the s… Ether 1:7   Ether 1:7             7
##  8         3      80       1404    37118 Book of Mormon Ether      The Book of Mormon The Book of Ether Another Testament of Jesus Christ <NA>          BoM                Ether            bm             ether                     1            8 And Moron was the s… Ether 1:8   Ether 1:8             8
##  9         3      80       1404    37119 Book of Mormon Ether      The Book of Mormon The Book of Ether Another Testament of Jesus Christ <NA>          BoM                Ether            bm             ether                     1            9 And Ethem was the s… Ether 1:9   Ether 1:9             9
## 10         3      80       1404    37120 Book of Mormon Ether      The Book of Mormon The Book of Ether Another Testament of Jesus Christ <NA>          BoM                Ether            bm             ether                     1           10 And Ahah was the so… Ether 1:10  Ether 1:10           10
## # ℹ 1,213 more rows

There are a bunch of columns here, but the ones we care about are book_title and text:

all_three |> 
  select(id, book_title, text)
## # A tibble: 1,223 × 3
##       id book_title text                                                                                                                                                                                                                                                                                
##    <int> <ord>      <chr>                                                                                                                                                                                                                                                                               
##  1     1 Ether      And now I, Moroni, proceed to give an account of those ancient inhabitants who were destroyed by the hand of the Lord upon the face of this north country.                                                                                                                          
##  2     2 Ether      And I take mine account from the twenty and four plates which were found by the people of Limhi, which is called the Book of Ether.                                                                                                                                                 
##  3     3 Ether      And as I suppose that the first part of this record, which speaks concerning the creation of the world, and also of Adam, and an account from that time even to the great tower, and whatsoever things transpired among the children of men until that time, is had among the Jews--
##  4     4 Ether      Therefore I do not write those things which transpired from the days of Adam until that time; but they are had upon the plates; and whoso findeth them, the same will have power that he may get the full account.                                                                  
##  5     5 Ether      But behold, I give not the full account, but a part of the account I give, from the tower down until they were destroyed.                                                                                                                                                           
##  6     6 Ether      And on this wise do I give the account. He that wrote this record was Ether, and he was a descendant of Coriantor.                                                                                                                                                                  
##  7     7 Ether      Coriantor was the son of Moron.                                                                                                                                                                                                                                                     
##  8     8 Ether      And Moron was the son of Ethem.                                                                                                                                                                                                                                                     
##  9     9 Ether      And Ethem was the son of Ahah.                                                                                                                                                                                                                                                      
## 10    10 Ether      And Ahah was the son of Seth.                                                                                                                                                                                                                                                       
## # ℹ 1,213 more rows

Most unique words by book

This is what you have already—the most unique words in each sector (or book here):

# Tokenize the text into individual words
words_to_analyze <- all_three |> 
  unnest_tokens(output = word, input = text) |> 
  # Do whatever other data cleaning and filtering stuff you do here. Here I'm 
  # just removing stop words
  anti_join(stop_words, by = join_by(word))

# Find the count of each word in each book, then find the tf-idf
words_with_tf_idf <- words_to_analyze |> 
  count(book_title, word) |> 
  bind_tf_idf(word, book_title, n) |> 
  arrange(desc(tf_idf))

# Find the highest tf-idf in each book
most_unique <- words_with_tf_idf |> 
  group_by(book_title) |> 
  slice_max(tf_idf, n = 10, with_ties = FALSE) |> 
  ungroup() |> 
  # Make this an ordered factor so the words plot in the right order
  mutate(word = fct_inorder(word))
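
Incidentally, if you’re curious what bind_tf_idf() is doing behind the scenes, here’s a minimal sketch that reproduces its math by hand: tf is a word’s share of all the words in its book, and idf is the natural log of the number of books divided by the number of books containing the word. The results should match words_with_tf_idf from above.

# Reproduce bind_tf_idf()'s math manually
n_books <- n_distinct(words_to_analyze$book_title)

manual_tf_idf <- words_to_analyze |> 
  count(book_title, word) |> 
  # tf: the word's count as a share of all words in that book
  mutate(tf = n / sum(n), .by = book_title) |> 
  # idf: log(number of books / number of books containing the word)
  mutate(idf = log(n_books / n_distinct(book_title)), .by = word) |> 
  mutate(tf_idf = tf * idf) |> 
  arrange(desc(tf_idf))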

Here’s a table:

values_for_table <- most_unique |>
  select(book_title, word, tf_idf) |>
  mutate(row = row_number(), .by = book_title) |>
  pivot_wider(
    id_cols = row,
    names_from = book_title,
    values_from = c(word, tf_idf),
    names_glue = "{book_title}_{.value}"
  ) |>
  select(
    Daniel_word, Daniel_tf_idf,
    Romans_word, Romans_tf_idf,
    Ether_word, Ether_tf_idf
  )

values_for_table |> 
  setNames(rep(c("Word", "tf-idf"), 3)) |> 
  tt(digits = 4) |> 
  group_tt(j = list("Daniel" = 1:2, "Romans" = 3:4, "Ether" = 5:6)) |> 
  style_tt("groupj", bold = TRUE)
Daniel                      Romans                       Ether
Word             tf-idf     Word             tf-idf      Word         tf-idf
daniel           0.020589   christ           0.008741    jared        0.018432
king             0.015603   circumcision     0.005302    coriantumr   0.013502
nebuchadnezzar   0.008785   salute           0.005302    land         0.008622
vision           0.006039   faith            0.005088    shiz         0.006644
king's           0.00549    jesus            0.004957    brother      0.005932
princes          0.004941   eateth           0.003888    daughters    0.005572
babylon          0.004667   forbid           0.003535    shule        0.005572
prince           0.004667   justified        0.003535    akish        0.005144
abed             0.004118   dead             0.003261    lib          0.004715
gods             0.004118   gentiles         0.003131    stead        0.004286

And a plot:

# Plot this stuff
ggplot(most_unique, aes(x = tf_idf, y = fct_rev(word), fill = book_title)) +
  geom_col() + 
  # Use Set1 from ColorBrewer for fun
  # https://colorbrewer2.org/#type=qualitative&scheme=Set1&n=3
  scale_fill_brewer(palette = "Set1") +
  guides(fill = "none") + 
  labs(y = NULL, title = "Most unique words") +
  facet_wrap(vars(book_title), scales = "free_y") +
  theme_bw() +
  theme(plot.title = element_text(face = "bold"))

Part of speech tagging

(See here for a longer, more detailed example)

R has no way of knowing if words are nouns, verbs, or adjectives. You can algorithmically predict what part of speech each word is with a part-of-speech tagger, like spaCy or Stanford’s Natural Language Processing (NLP) library.

These are external programs that are not written in R and don’t naturally communicate with R (spaCy is written in Python; Stanford’s CoreNLP is written in Java). There’s a helpful R package named {cleanNLP} that lets you interact with these programs from within R, which is super helpful. {cleanNLP} also comes with its own R-only tagger, so you don’t need to install anything with Python or Java (though it’s not as powerful as either alternative: spaCy is faster, and Stanford’s NLP library handles languages like Arabic and Chinese that the R-only tagger doesn’t).

You can see other examples of part-of-speech tagging (along with instructions for how to install spaCy and CoreNLP) here.

Here’s the general process for tagging (or “annotating”) text with the {cleanNLP} package:

  1. Make a dataset where one column is the id (line number, chapter number, book+chapter, etc.), and another column is the text itself.

  2. Initialize the NLP tagger. You can use any of these:

    • cnlp_init_udpipe(): Use an R-only tagger that should work without installing anything extra (a little slower than the others, but requires no extra steps!)
    • cnlp_init_spacy(): Use spaCy (if you’ve installed it on your computer with Python)
    • cnlp_init_corenlp(): Use Stanford’s NLP library (if you’ve installed it on your computer with Java)
  3. Feed the data frame from step 1 into the cnlp_annotate() function and wait.

  4. Save the tagged data on your computer so you don’t have to re-tag it every time.

I’ll show an example with both the R-only udpipe tagger (fast, easy, less accurate) and with spaCy (slower, requires Python, more accurate). In practice, with real data, I’d use spaCy.

For the tagger to work, each row needs to have a unique identifier. With your data, you’ll likely have a unique respondent ID from Qualtrics. With this scripture data, I created an id column at the beginning for this.

Tagging with udpipe

Actual tagging

We first need to tell R to use the udpipe part-of-speech tagger by running cnlp_init_udpipe(). We then feed each verse into cnlp_annotate() and specify that the text comes from the text column and the “document” is the id column (for each verse; yours would be the Qualtrics ID or whatever).

The results will be stored in all_three_tagged here, which isn’t a dataframe. It’s a list with two slots. The one we care about is the $token slot, which is a dataframe:

cnlp_init_udpipe()

all_three_tagged <- all_three |> 
  cnlp_annotate(text_name = "text", doc_name = "id")
all_three_tagged$token
## # A tibble: 43,017 × 11
##    doc_id   sid tid   token   token_with_ws lemma   upos  xpos  feats                                      tid_source relation
##  *  <int> <int> <chr> <chr>   <chr>         <chr>   <chr> <chr> <chr>                                      <chr>      <chr>   
##  1      1     1 1     And     "And "        and     CCONJ CC    <NA>                                       7          cc      
##  2      1     1 2     now     "now "        now     ADV   RB    <NA>                                       7          advmod  
##  3      1     1 3     I       "I"           I       PRON  PRP   Case=Nom|Number=Sing|Person=1|PronType=Prs 7          nsubj   
##  4      1     1 4     ,       ", "          ,       PUNCT ,     <NA>                                       5          punct   
##  5      1     1 5     Moroni  "Moroni"      Moroni  PROPN NNP   Number=Sing                                7          nsubj   
##  6      1     1 6     ,       ", "          ,       PUNCT ,     <NA>                                       7          punct   
##  7      1     1 7     proceed "proceed "    proceed VERB  VBD   Mood=Ind|Tense=Past|VerbForm=Fin           0          root    
##  8      1     1 8     to      "to "         to      PART  TO    <NA>                                       9          mark    
##  9      1     1 9     give    "give "       give    VERB  VB    VerbForm=Inf                               7          advcl   
## 10      1     1 10    an      "an "         a       DET   DT    Definite=Ind|PronType=Art                  11         det     
## # ℹ 43,007 more rows

This is so cool. There are a bunch of new columns like lemma (the base form of each word), and upos and xpos for the different parts of speech. upos uses the Universal POS tag system; xpos uses the Penn Treebank code system.
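
As a quick sanity check on the tagging, you can count how often each universal tag shows up (output omitted here):

# Tally the universal POS tags across all three books
all_three_tagged$token |> 
  count(upos, sort = TRUE)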

Most of the other columns—like verse number and chapter number and book title and all that—got thrown away during the tagging process, but we can merge back in whatever we want. Here I’ll grab the book title, chapter number, and verse number for fun and put them in a little lookup dataset, then join that to the tagged words:

all_three_lookup <- all_three |> 
  select(id, book_title, chapter_number, verse_number)

all_three_full <- all_three_tagged$token |> 
  left_join(all_three_lookup, by = join_by(doc_id == id))
all_three_full
## # A tibble: 43,017 × 14
##    doc_id   sid tid   token   token_with_ws lemma   upos  xpos  feats                                      tid_source relation book_title chapter_number verse_number
##     <int> <int> <chr> <chr>   <chr>         <chr>   <chr> <chr> <chr>                                      <chr>      <chr>    <ord>               <dbl>        <dbl>
##  1      1     1 1     And     "And "        and     CCONJ CC    <NA>                                       7          cc       Ether                   1            1
##  2      1     1 2     now     "now "        now     ADV   RB    <NA>                                       7          advmod   Ether                   1            1
##  3      1     1 3     I       "I"           I       PRON  PRP   Case=Nom|Number=Sing|Person=1|PronType=Prs 7          nsubj    Ether                   1            1
##  4      1     1 4     ,       ", "          ,       PUNCT ,     <NA>                                       5          punct    Ether                   1            1
##  5      1     1 5     Moroni  "Moroni"      Moroni  PROPN NNP   Number=Sing                                7          nsubj    Ether                   1            1
##  6      1     1 6     ,       ", "          ,       PUNCT ,     <NA>                                       7          punct    Ether                   1            1
##  7      1     1 7     proceed "proceed "    proceed VERB  VBD   Mood=Ind|Tense=Past|VerbForm=Fin           0          root     Ether                   1            1
##  8      1     1 8     to      "to "         to      PART  TO    <NA>                                       9          mark     Ether                   1            1
##  9      1     1 9     give    "give "       give    VERB  VB    VerbForm=Inf                               7          advcl    Ether                   1            1
## 10      1     1 10    an      "an "         a       DET   DT    Definite=Ind|PronType=Art                  11         det      Ether                   1            1
## # ℹ 43,007 more rows

Most unique nouns/verbs by book

Great! Now we can do some tf-idf work on this. But one more step first! Some words can be multiple parts of speech: “call” and “impact” can be either nouns or verbs (and “impact” might actually appear in your data).

So to get around this, we’ll make a new column called lemma_upos that is the combination of the word and the part of speech, like “now_ADV” and “Moroni_PROPN”. If the word “impact” appeared in this data, we’d be able to see “impact_VERB” and “impact_NOUN”.

all_three_full <- all_three_full |> 
  mutate(lemma_upos = paste0(lemma, "_", upos))

all_three_full |> 
  select(lemma_upos)
## # A tibble: 43,017 × 1
##    lemma_upos  
##    <chr>       
##  1 and_CCONJ   
##  2 now_ADV     
##  3 I_PRON      
##  4 ,_PUNCT     
##  5 Moroni_PROPN
##  6 ,_PUNCT     
##  7 proceed_VERB
##  8 to_PART     
##  9 give_VERB   
## 10 a_DET       
## # ℹ 43,007 more rows

It’s okay that these aren’t actual regular English words. bind_tf_idf() doesn’t care—it’s only looking at the frequency of things inside documents.

It’s also okay that we have weird-looking words like this, because we’ll use separate() to split them back into two separate columns in the end:

tf_idf_pos <- all_three_full |> 
  count(book_title, lemma_upos, sort = TRUE) |> 
  bind_tf_idf(lemma_upos, book_title, n) |> 
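  # Split "lemma_POS" back into two columns; sep = "_" is safe here
  # because the lemmas in this text don't contain underscores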
  separate(lemma_upos, into = c("lemma", "upos"), sep = "_") |> 
  arrange(desc(tf_idf))
tf_idf_pos
## # A tibble: 4,229 × 7
##    book_title lemma      upos      n      tf   idf  tf_idf
##    <ord>      <chr>      <chr> <int>   <dbl> <dbl>   <dbl>
##  1 Daniel     Daniel     PROPN    72 0.00535 1.10  0.00587
##  2 Daniel     king       NOUN    183 0.0136  0.405 0.00551
##  3 Ether      Jared      PROPN    83 0.00446 1.10  0.00490
##  4 Ether      Coriantumr PROPN    63 0.00338 1.10  0.00372
##  5 Daniel     prince     NOUN     33 0.00245 1.10  0.00269
##  6 Daniel     vision     NOUN     30 0.00223 1.10  0.00245
##  7 Romans     Christ     PROPN    66 0.00603 0.405 0.00245
##  8 Ether      land       NOUN    111 0.00596 0.405 0.00242
##  9 Ether      Shiz       PROPN    30 0.00161 1.10  0.00177
## 10 Ether      brother    NOUN     76 0.00408 0.405 0.00166
## # ℹ 4,219 more rows

Heck yeah. “Daniel” is the most unique proper noun in the Book of Daniel while “king” is the most unique regular noun.

Some plots because why not:

most_unique_nouns <- tf_idf_pos |> 
  filter(upos %in% c("PROPN", "NOUN")) |> 
  group_by(book_title) |> 
  slice_max(tf_idf, n = 10) |> 
  ungroup() |> 
  mutate(word = fct_inorder(lemma))

ggplot(most_unique_nouns, aes(x = tf_idf, y = fct_rev(word), fill = book_title)) +
  geom_col() + 
  scale_fill_brewer(palette = "Set1") +
  guides(fill = "none") + 
  labs(y = NULL, title = "Most unique nouns") +
  facet_wrap(vars(book_title), scales = "free_y") +
  theme_bw() +
  theme(plot.title = element_text(face = "bold"))

most_unique_verbs <- tf_idf_pos |> 
  filter(upos == "VERB") |> 
  group_by(book_title) |> 
  slice_max(tf_idf, n = 10) |> 
  ungroup() |> 
  mutate(word = fct_inorder(lemma))

ggplot(most_unique_verbs, aes(x = tf_idf, y = fct_rev(word), fill = book_title)) +
  geom_col() + 
  scale_fill_brewer(palette = "Set1") +
  guides(fill = "none") + 
  labs(y = NULL, title = "Most unique verbs") +
  facet_wrap(vars(book_title), scales = "free_y") +
  theme_bw() +
  theme(plot.title = element_text(face = "bold"))

Tagging with spaCy

udpipe is fast, but not completely accurate. Notice how the fourth most unique verb in Daniel is “Nebuchadnezzar” and the most unique verb is “A”. Those obviously aren’t verbs. Either udpipe is bad at biblical words, or it’s not great in general.
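
If you want to see those mis-tags directly instead of squinting at the plot, a quick check like this works (output not shown):

# Look at the top udpipe-tagged "verbs" in Daniel; several of these
# aren't really verbs
tf_idf_pos |> 
  filter(book_title == "Daniel", upos == "VERB") |> 
  slice_max(tf_idf, n = 5)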

Installing spaCy

spaCy is generally more accurate, but you need to install it and run it through Python. You can either do this on your own in the terminal and manage your own Python installation (yuck) or let R create a virtual Python environment for you with the {reticulate} package so you can manage and run everything from R itself.

You should only have to do this once on your computer:

library(reticulate)  # For accessing Python from R

# Create a new virtual environment
virtualenv_create("tagging-fun-times")

# Install the cleannlp Python package
py_install("cleannlp", envname = "tagging-fun-times")

# Download the spaCy text data
cleanNLP::cnlp_download_spacy("en_core_web_sm")

Alternatively, you can set up the Python environment yourself in the terminal with uv:

  1. Install uv for easy Python installation and package management: https://docs.astral.sh/uv/#installation

  2. Run this in the terminal (uv will automatically install Python if needed):

uv venv tagging-fun-times
source tagging-fun-times/bin/activate
uv pip install spacy
python -m spacy download en_core_web_sm
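
If you go the uv route, {reticulate} can point at that environment by path instead (a sketch, assuming you created the environment in your project folder):

reticulate::use_virtualenv("./tagging-fun-times", required = TRUE)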

Actual tagging

As before with the udpipe tagger, we’ll use cnlp_annotate(). The code is actually all identical—we just need to tell R to use the spaCy tagger with cnlp_init_spacy():

# Tell R to use the Python virtual environment made earlier
reticulate::use_virtualenv("tagging-fun-times", required = TRUE)

# Tell R to use spaCy
cnlp_init_spacy(model_name = "en_core_web_sm")

all_three_tagged_spacy <- all_three |> 
  cnlp_annotate(text_name = "text", doc_name = "id")
all_three_tagged_spacy$token
## # A tibble: 43,011 × 10
##    doc_id   sid   tid token   token_with_ws lemma   upos  xpos  tid_source relation
##     <int> <int> <int> <chr>   <chr>         <chr>   <chr> <chr>      <int> <chr>   
##  1      1     1     1 And     "And "        and     CCONJ CC             7 cc      
##  2      1     1     2 now     "now "        now     ADV   RB             7 advmod  
##  3      1     1     3 I       "I"           I       PRON  PRP            7 nsubj   
##  4      1     1     4 ,       ", "          ,       PUNCT ,              3 punct   
##  5      1     1     5 Moroni  "Moroni"      Moroni  PROPN NNP            3 appos   
##  6      1     1     6 ,       ", "          ,       PUNCT ,              3 punct   
##  7      1     1     7 proceed "proceed "    proceed VERB  VBP            0 root    
##  8      1     1     8 to      "to "         to      PART  TO             9 aux     
##  9      1     1     9 give    "give "       give    VERB  VB             7 xcomp   
## 10      1     1    10 an      "an "         an      DET   DT            11 det     
## # ℹ 43,001 more rows
Note: Save the intermediate data!

This part-of-speech tagging process can take a long time. If you don’t want to keep rerunning this all the time, you should probably save the dataset as some intermediate object, like an .rds file or a CSV:

# Like this:
saveRDS(all_three_tagged_spacy, "tagged-text.rds")

# Or like this:
write_csv(all_three_tagged_spacy$token, "tagged-text.csv")

…and then load that instead of retagging the text:

# Like this:
all_three_tagged_spacy <- readRDS("tagged-text.rds")

# Or like this:
all_three_tagged_spacy <- read_csv("tagged-text.csv")
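
One way to automate the save-or-load decision is a simple file-existence check, sketched here with the same file name as above:

# Only run the (slow) tagger if there's no saved copy yet
# (this assumes you've already run cnlp_init_spacy() as above)
if (!file.exists("tagged-text.rds")) {
  all_three_tagged_spacy <- all_three |> 
    cnlp_annotate(text_name = "text", doc_name = "id")
  saveRDS(all_three_tagged_spacy, "tagged-text.rds")
} else {
  all_three_tagged_spacy <- readRDS("tagged-text.rds")
}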

Just like before, we have columns for lemma (the base form of each word), upos (Universal POS tag), and xpos (Penn Treebank code).

We can join the other columns back in just like the udpipe version too:

all_three_lookup <- all_three |> 
  select(id, book_title, chapter_number, verse_number)

all_three_full_spacy <- all_three_tagged_spacy$token |> 
  left_join(all_three_lookup, by = join_by(doc_id == id))
all_three_full_spacy
## # A tibble: 43,011 × 13
##    doc_id   sid   tid token   token_with_ws lemma   upos  xpos  tid_source relation book_title chapter_number verse_number
##     <int> <int> <int> <chr>   <chr>         <chr>   <chr> <chr>      <int> <chr>    <ord>               <dbl>        <dbl>
##  1      1     1     1 And     "And "        and     CCONJ CC             7 cc       Ether                   1            1
##  2      1     1     2 now     "now "        now     ADV   RB             7 advmod   Ether                   1            1
##  3      1     1     3 I       "I"           I       PRON  PRP            7 nsubj    Ether                   1            1
##  4      1     1     4 ,       ", "          ,       PUNCT ,              3 punct    Ether                   1            1
##  5      1     1     5 Moroni  "Moroni"      Moroni  PROPN NNP            3 appos    Ether                   1            1
##  6      1     1     6 ,       ", "          ,       PUNCT ,              3 punct    Ether                   1            1
##  7      1     1     7 proceed "proceed "    proceed VERB  VBP            0 root     Ether                   1            1
##  8      1     1     8 to      "to "         to      PART  TO             9 aux      Ether                   1            1
##  9      1     1     9 give    "give "       give    VERB  VB             7 xcomp    Ether                   1            1
## 10      1     1    10 an      "an "         an      DET   DT            11 det      Ether                   1            1
## # ℹ 43,001 more rows

Most unique nouns/verbs by book

Just like before, we’ll make a new column called lemma_upos that is the combination of the word and the part of speech, like “now_ADV” and “Moroni_PROPN”.

all_three_full_spacy <- all_three_full_spacy |> 
  mutate(lemma_upos = paste0(lemma, "_", upos))

all_three_full_spacy |> 
  select(lemma_upos)
## # A tibble: 43,011 × 1
##    lemma_upos  
##    <chr>       
##  1 and_CCONJ   
##  2 now_ADV     
##  3 I_PRON      
##  4 ,_PUNCT     
##  5 Moroni_PROPN
##  6 ,_PUNCT     
##  7 proceed_VERB
##  8 to_PART     
##  9 give_VERB   
## 10 an_DET      
## # ℹ 43,001 more rows

And then we can calculate the tf-idf:

tf_idf_pos_spacy <- all_three_full_spacy |> 
  count(book_title, lemma_upos, sort = TRUE) |> 
  bind_tf_idf(lemma_upos, book_title, n) |> 
  separate(lemma_upos, into = c("lemma", "upos"), sep = "_") |> 
  arrange(desc(tf_idf))
tf_idf_pos_spacy
## # A tibble: 3,903 × 7
##    book_title lemma          upos      n      tf   idf  tf_idf
##    <ord>      <chr>          <chr> <int>   <dbl> <dbl>   <dbl>
##  1 Daniel     Daniel         PROPN    75 0.00557 1.10  0.00612
##  2 Daniel     king           NOUN    186 0.0138  0.405 0.00560
##  3 Ether      Jared          PROPN    86 0.00462 1.10  0.00508
##  4 Ether      Coriantumr     PROPN    62 0.00333 1.10  0.00366
##  5 Daniel     prince         NOUN     33 0.00245 1.10  0.00269
##  6 Daniel     vision         NOUN     32 0.00238 1.10  0.00261
##  7 Ether      land           NOUN    118 0.00634 0.405 0.00257
##  8 Romans     Christ         PROPN    68 0.00622 0.405 0.00252
##  9 Daniel     Nebuchadnezzar PROPN    30 0.00223 1.10  0.00245
## 10 Ether      show           VERB     36 0.00193 1.10  0.00213
## # ℹ 3,893 more rows

Finally, some plots. These look a lot better! All the verbs are actually verbs this time—spaCy worked nicely.

most_unique_nouns_spacy <- tf_idf_pos_spacy |>
  filter(upos %in% c("PROPN", "NOUN")) |>
  group_by(book_title) |>
  slice_max(tf_idf, n = 10) |>
  ungroup() |>
  mutate(word = fct_inorder(lemma))

ggplot(
  most_unique_nouns_spacy,
  aes(x = tf_idf, y = fct_rev(word), fill = book_title)
) +
  geom_col() +
  scale_fill_brewer(palette = "Set1") +
  guides(fill = "none") +
  labs(y = NULL, title = "Most unique nouns (spaCy)") +
  facet_wrap(vars(book_title), scales = "free_y") +
  theme_bw() +
  theme(plot.title = element_text(face = "bold"))

most_unique_verbs_spacy <- tf_idf_pos_spacy |>
  filter(upos == "VERB") |>
  group_by(book_title) |>
  slice_max(tf_idf, n = 10) |>
  ungroup() |>
  mutate(word = fct_inorder(lemma))

ggplot(
  most_unique_verbs_spacy,
  aes(x = tf_idf, y = fct_rev(word), fill = book_title)
) +
  geom_col() +
  scale_fill_brewer(palette = "Set1") +
  guides(fill = "none") +
  labs(y = NULL, title = "Most unique verbs (spaCy)") +
  facet_wrap(vars(book_title), scales = "free_y") +
  theme_bw() +
  theme(plot.title = element_text(face = "bold"))

The whole game

All of that ↑ was fairly didactic and separated into multiple steps.

Here’s ↓ what the whole process looks like all together using spaCy:

library(tidyverse)
library(tidytext)
library(scriptuRs)
library(cleanNLP)

ether <- scriptuRs::book_of_mormon |>
  filter(book_title == "Ether")

daniel <- scriptuRs::old_testament |>
  filter(book_title == "Daniel")

romans <- scriptuRs::new_testament |>
  filter(book_title == "Romans")

all_three <- bind_rows(ether, daniel, romans) |>
  # Having a unique id number column will be helpful later for joining things
  mutate(id = row_number()) |>
  # Put these in order: OT > NT > BoM
  mutate(
    book_title = factor(
      book_title,
      levels = c("Daniel", "Romans", "Ether"),
      ordered = TRUE
    )
  )

# --------------------------------------------------------------
# This section only needs to run once if you save the 
# all_three_tagged_spacy data as RDS or CSV

# Tell R to use the Python virtual environment made earlier
reticulate::use_virtualenv("tagging-fun-times", required = TRUE)

# Tell R to use spaCy
cnlp_init_spacy(model_name = "en_core_web_sm")

# Tag the parts of speech
all_three_tagged_spacy <- all_three |> 
  cnlp_annotate(text_name = "text", doc_name = "id")

# Save this so it doesn't have to run again
saveRDS(all_three_tagged_spacy, "tagged-text.rds")
# --------------------------------------------------------------

# --------------------------------------------------------------
# Use this if you want to skip the tagging process and instead 
# bring in the pre-tagged words that you tagged earlier
all_three_tagged_spacy <- readRDS("tagged-text.rds")
# --------------------------------------------------------------

# Bring back some of the original columns we care about
all_three_lookup <- all_three |> 
  select(id, book_title, chapter_number, verse_number)

# Join the original columns, make a WORD_POS word
all_three_full_spacy <- all_three_tagged_spacy$token |> 
  left_join(all_three_lookup, by = join_by(doc_id == id)) |> 
  mutate(lemma_upos = paste0(lemma, "_", upos))

# Calculate tf-idfs
tf_idf_pos_spacy <- all_three_full_spacy |> 
  count(book_title, lemma_upos, sort = TRUE) |> 
  bind_tf_idf(lemma_upos, book_title, n) |> 
  separate(lemma_upos, into = c("lemma", "upos"), sep = "_") |> 
  arrange(desc(tf_idf))

# Plot stuff
most_unique_nouns_spacy <- tf_idf_pos_spacy |>
  filter(upos %in% c("PROPN", "NOUN")) |>
  group_by(book_title) |>
  slice_max(tf_idf, n = 10) |>
  ungroup() |>
  mutate(word = fct_inorder(lemma))

ggplot(
  most_unique_nouns_spacy,
  aes(x = tf_idf, y = fct_rev(word), fill = book_title)
) +
  geom_col() +
  scale_fill_brewer(palette = "Set1") +
  guides(fill = "none") +
  labs(y = NULL, title = "Most unique nouns (spaCy)") +
  facet_wrap(vars(book_title), scales = "free_y") +
  theme_bw() +
  theme(plot.title = element_text(face = "bold"))