library(tidyverse)
library(tidytext)
library(tinytable)
library(scriptuRs)
library(cleanNLP)
ether <- scriptuRs::book_of_mormon |>
filter(book_title == "Ether")
daniel <- scriptuRs::old_testament |>
filter(book_title == "Daniel")
romans <- scriptuRs::new_testament |>
filter(book_title == "Romans")
all_three <- bind_rows(ether, daniel, romans) |>
# Having a unique id number column will be helpful later for joining things
mutate(id = row_number()) |>
# Put these in order: OT > NT > BoM
mutate(
book_title = factor(
book_title,
levels = c("Daniel", "Romans", "Ether"),
ordered = TRUE
)
) |>
# Making this a tibble isn't necessary, but it makes it print nicer in this document
as_tibble()
all_three
## # A tibble: 1,223 × 20
## volume_id book_id chapter_id verse_id volume_title book_title volume_long_title book_long_title volume_subtitle book_subtitle volume_short_title book_short_title volume_lds_url book_lds_url chapter_number verse_number text verse_title verse_short_title id
## <dbl> <dbl> <dbl> <dbl> <chr> <ord> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <dbl> <dbl> <chr> <chr> <chr> <int>
## 1 3 80 1404 37111 Book of Mormon Ether The Book of Mormon The Book of Ether Another Testament of Jesus Christ <NA> BoM Ether bm ether 1 1 And now I, Moroni, … Ether 1:1 Ether 1:1 1
## 2 3 80 1404 37112 Book of Mormon Ether The Book of Mormon The Book of Ether Another Testament of Jesus Christ <NA> BoM Ether bm ether 1 2 And I take mine acc… Ether 1:2 Ether 1:2 2
## 3 3 80 1404 37113 Book of Mormon Ether The Book of Mormon The Book of Ether Another Testament of Jesus Christ <NA> BoM Ether bm ether 1 3 And as I suppose th… Ether 1:3 Ether 1:3 3
## 4 3 80 1404 37114 Book of Mormon Ether The Book of Mormon The Book of Ether Another Testament of Jesus Christ <NA> BoM Ether bm ether 1 4 Therefore I do not … Ether 1:4 Ether 1:4 4
## 5 3 80 1404 37115 Book of Mormon Ether The Book of Mormon The Book of Ether Another Testament of Jesus Christ <NA> BoM Ether bm ether 1 5 But behold, I give … Ether 1:5 Ether 1:5 5
## 6 3 80 1404 37116 Book of Mormon Ether The Book of Mormon The Book of Ether Another Testament of Jesus Christ <NA> BoM Ether bm ether 1 6 And on this wise do… Ether 1:6 Ether 1:6 6
## 7 3 80 1404 37117 Book of Mormon Ether The Book of Mormon The Book of Ether Another Testament of Jesus Christ <NA> BoM Ether bm ether 1 7 Coriantor was the s… Ether 1:7 Ether 1:7 7
## 8 3 80 1404 37118 Book of Mormon Ether The Book of Mormon The Book of Ether Another Testament of Jesus Christ <NA> BoM Ether bm ether 1 8 And Moron was the s… Ether 1:8 Ether 1:8 8
## 9 3 80 1404 37119 Book of Mormon Ether The Book of Mormon The Book of Ether Another Testament of Jesus Christ <NA> BoM Ether bm ether 1 9 And Ethem was the s… Ether 1:9 Ether 1:9 9
## 10 3 80 1404 37120 Book of Mormon Ether The Book of Mormon The Book of Ether Another Testament of Jesus Christ <NA> BoM Ether bm ether 1 10 And Ahah was the so… Ether 1:10 Ether 1:10 10
## # ℹ 1,213 more rows

tf-idf and parts of speech
You had a really neat table showing the ten most unique words (i.e. highest tf-idf values) across the three sector-specific treatments.
I obviously don’t have your data, but I do have, um, Scripture Power™, so as an illustrative example I arbitrarily chose three books that are roughly the same length: Daniel, Romans, and Ether (all from the {scriptuRs} R package—you’ll probably need to install it if you don’t have it).
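If you don’t already have {scriptuRs}, a standard CRAN install should work:

install.packages("scriptuRs")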
Load data
There are a bunch of columns here, but the ones we care about are book_title and text:
all_three |>
select(id, book_title, text)
## # A tibble: 1,223 × 3
## id book_title text
## <int> <ord> <chr>
## 1 1 Ether And now I, Moroni, proceed to give an account of those ancient inhabitants who were destroyed by the hand of the Lord upon the face of this north country.
## 2 2 Ether And I take mine account from the twenty and four plates which were found by the people of Limhi, which is called the Book of Ether.
## 3 3 Ether And as I suppose that the first part of this record, which speaks concerning the creation of the world, and also of Adam, and an account from that time even to the great tower, and whatsoever things transpired among the children of men until that time, is had among the Jews--
## 4 4 Ether Therefore I do not write those things which transpired from the days of Adam until that time; but they are had upon the plates; and whoso findeth them, the same will have power that he may get the full account.
## 5 5 Ether But behold, I give not the full account, but a part of the account I give, from the tower down until they were destroyed.
## 6 6 Ether And on this wise do I give the account. He that wrote this record was Ether, and he was a descendant of Coriantor.
## 7 7 Ether Coriantor was the son of Moron.
## 8 8 Ether And Moron was the son of Ethem.
## 9 9 Ether And Ethem was the son of Ahah.
## 10 10 Ether And Ahah was the son of Seth.
## # ℹ 1,213 more rows

Most unique words by book
This is what you have already—the most unique words in each sector (or book here):
# Tokenize the text into individual words
words_to_analyze <- all_three |>
unnest_tokens(output = word, input = text) |>
# Do whatever other data cleaning and filtering stuff you do here. Here I'm
# just removing stop words
anti_join(stop_words, by = join_by(word))
# Find the count of each word in each book, then find the tf-idf
words_with_tf_idf <- words_to_analyze |>
count(book_title, word) |>
bind_tf_idf(word, book_title, n) |>
arrange(desc(tf_idf))
# Find the highest tf-idf in each book
most_unique <- words_with_tf_idf |>
group_by(book_title) |>
slice_max(tf_idf, n = 10, with_ties = FALSE) |>
ungroup() |>
# Make this an ordered factor so the words plot in the right order
mutate(word = fct_inorder(word))

Here’s a table:
values_for_table <- most_unique |>
select(book_title, word, tf_idf) |>
mutate(row = row_number(), .by = book_title) |>
pivot_wider(
id_cols = row,
names_from = book_title,
values_from = c(word, tf_idf),
names_glue = "{book_title}_{.value}"
) |>
select(
Daniel_word, Daniel_tf_idf,
Romans_word, Romans_tf_idf,
Ether_word, Ether_tf_idf
)
values_for_table |>
setNames(rep(c("Word", "tf-idf"), 3)) |>
tt(digits = 4) |>
group_tt(j = list("Daniel" = 1:2, "Romans" = 3:4, "Ether" = 5:6)) |>
style_tt("groupj", bold = TRUE)| Daniel | Romans | Ether | |||
|---|---|---|---|---|---|
| Word | tf-idf | Word | tf-idf | Word | tf-idf |
| daniel | 0.020589 | christ | 0.008741 | jared | 0.018432 |
| king | 0.015603 | circumcision | 0.005302 | coriantumr | 0.013502 |
| nebuchadnezzar | 0.008785 | salute | 0.005302 | land | 0.008622 |
| vision | 0.006039 | faith | 0.005088 | shiz | 0.006644 |
| king's | 0.00549 | jesus | 0.004957 | brother | 0.005932 |
| princes | 0.004941 | eateth | 0.003888 | daughters | 0.005572 |
| babylon | 0.004667 | forbid | 0.003535 | shule | 0.005572 |
| prince | 0.004667 | justified | 0.003535 | akish | 0.005144 |
| abed | 0.004118 | dead | 0.003261 | lib | 0.004715 |
| gods | 0.004118 | gentiles | 0.003131 | stead | 0.004286 |
And a plot:
# Plot this stuff
ggplot(most_unique, aes(x = tf_idf, y = fct_rev(word), fill = book_title)) +
geom_col() +
# Use Set1 from ColorBrewer for fun
# https://colorbrewer2.org/#type=qualitative&scheme=Set1&n=3
scale_fill_brewer(palette = "Set1") +
guides(fill = "none") +
labs(y = NULL, title = "Most unique words") +
facet_wrap(vars(book_title), scales = "free_y") +
theme_bw() +
theme(plot.title = element_text(face = "bold"))
Part of speech tagging
(See here for a longer, more detailed example)
R has no way of knowing whether words are nouns, verbs, or adjectives. You can algorithmically predict each word’s part of speech with a part-of-speech tagger, like spaCy or Stanford’s Natural Language Processing (NLP) library.
These are external programs that aren’t written in R and don’t naturally communicate with R (spaCy is written in Python; Stanford’s CoreNLP is written in Java). There’s an R package named {cleanNLP} that lets you interact with these programs from within R, which is super helpful. {cleanNLP} also comes with its own R-only tagger so you don’t need to install anything with Python or Java (however, it’s not as fast as spaCy, and it can’t handle languages like Arabic and Chinese the way Stanford’s NLP library can).
You can see other examples of part-of-speech tagging (along with instructions for how to install spaCy and coreNLP) here:
- “Tidy text, parts of speech, and unique words in the Bible”
- “Tidy text, parts of speech, and unique words in the Qur’an”
Here’s the general process for tagging (or “annotating”) text with the {cleanNLP} package:
1. Make a dataset where one column is the id (line number, chapter number, book+chapter, etc.), and another column is the text itself.
2. Initialize the NLP tagger. You can use any of these:
   - cnlp_init_udpipe(): Use an R-only tagger that should work without installing anything extra (a little slower than the others, but requires no extra steps!)
   - cnlp_init_spacy(): Use spaCy (if you’ve installed it on your computer with Python)
   - cnlp_init_corenlp(): Use Stanford’s NLP library (if you’ve installed it on your computer with Java)
3. Feed the data frame from step 1 into the cnlp_annotate() function and wait.
4. Save the tagged data on your computer so you don’t have to re-tag it every time.
I’ll show an example with both the R-only udpipe tagger (fast, easy, less accurate) and with spaCy (slower, requires Python, more accurate). In practice, with real data, I’d use spaCy.
For the tagger to work, each row needs to have a unique identifier. With your data, you’ll likely have a unique respondent ID from Qualtrics. With this scripture data, I created an id column at the beginning for this.
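For instance, with a hypothetical survey export (survey_responses and open_response are made-up names, not from this post), you could either use your existing respondent ID column as the document ID or create a numeric one:

# Hypothetical survey data: add a numeric id column for the tagger to use
survey_responses <- survey_responses |>
  mutate(id = row_number())

# Later, point cnlp_annotate() at whichever text and id columns you have:
# survey_responses |> cnlp_annotate(text_name = "open_response", doc_name = "id")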
Tagging with udpipe
Actual tagging
We first need to tell R to use the udpipe part-of-speech tagger by running cnlp_init_udpipe(). We then feed each verse into cnlp_annotate() and specify that the text comes from the text column and the “document” is the id column (for each verse; yours would be the Qualtrics ID or whatever).
The results will be stored in all_three_tagged here, which isn’t a dataframe. It’s a list with two slots. The one we care about is the $token slot, which is a dataframe:
cnlp_init_udpipe()
all_three_tagged <- all_three |>
cnlp_annotate(text_name = "text", doc_name = "id")all_three_tagged$token
## # A tibble: 43,017 × 11
## doc_id sid tid token token_with_ws lemma upos xpos feats tid_source relation
## * <int> <int> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
## 1 1 1 1 And "And " and CCONJ CC <NA> 7 cc
## 2 1 1 2 now "now " now ADV RB <NA> 7 advmod
## 3 1 1 3 I "I" I PRON PRP Case=Nom|Number=Sing|Person=1|PronType=Prs 7 nsubj
## 4 1 1 4 , ", " , PUNCT , <NA> 5 punct
## 5 1 1 5 Moroni "Moroni" Moroni PROPN NNP Number=Sing 7 nsubj
## 6 1 1 6 , ", " , PUNCT , <NA> 7 punct
## 7 1 1 7 proceed "proceed " proceed VERB VBD Mood=Ind|Tense=Past|VerbForm=Fin 0 root
## 8 1 1 8 to "to " to PART TO <NA> 9 mark
## 9 1 1 9 give "give " give VERB VB VerbForm=Inf 7 advcl
## 10 1 1 10 an "an " a DET DT Definite=Ind|PronType=Art 11 det
## # ℹ 43,007 more rows

This is so cool. There are a bunch of new columns like lemma (the base form of the word), and upos and xpos for the different parts of speech. upos uses the Universal POS tag system; xpos uses the Penn Treebank code system.
Most of the other columns—like verse number and chapter number and book title and all that—got thrown away during the tagging process, but we can merge back in whatever we want. Here I’ll grab the book title, chapter number, and verse number for fun and put them in a little lookup dataset, then join that to the tagged words:
all_three_lookup <- all_three |>
select(id, book_title, chapter_number, verse_number)
all_three_full <- all_three_tagged$token |>
left_join(all_three_lookup, by = join_by(doc_id == id))
all_three_full
## # A tibble: 43,017 × 14
## doc_id sid tid token token_with_ws lemma upos xpos feats tid_source relation book_title chapter_number verse_number
## <int> <int> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <ord> <dbl> <dbl>
## 1 1 1 1 And "And " and CCONJ CC <NA> 7 cc Ether 1 1
## 2 1 1 2 now "now " now ADV RB <NA> 7 advmod Ether 1 1
## 3 1 1 3 I "I" I PRON PRP Case=Nom|Number=Sing|Person=1|PronType=Prs 7 nsubj Ether 1 1
## 4 1 1 4 , ", " , PUNCT , <NA> 5 punct Ether 1 1
## 5 1 1 5 Moroni "Moroni" Moroni PROPN NNP Number=Sing 7 nsubj Ether 1 1
## 6 1 1 6 , ", " , PUNCT , <NA> 7 punct Ether 1 1
## 7 1 1 7 proceed "proceed " proceed VERB VBD Mood=Ind|Tense=Past|VerbForm=Fin 0 root Ether 1 1
## 8 1 1 8 to "to " to PART TO <NA> 9 mark Ether 1 1
## 9 1 1 9 give "give " give VERB VB VerbForm=Inf 7 advcl Ether 1 1
## 10 1 1 10 an "an " a DET DT Definite=Ind|PronType=Art 11 det Ether 1 1
## # ℹ 43,007 more rows

Most unique nouns/verbs by book
Great! Now we can do some tf-idf work on this. But one more step first! Some words can be multiple parts of speech: “call” and “impact” can be either nouns or verbs (and “impact” might actually appear in your data).
So to get around this, we’ll make a new column called lemma_upos that is the combination of the word and the part of speech, like “now_ADV” and “Moroni_PROPN”. If the word “impact” appeared in this data, we’d be able to see “impact_VERB” and “impact_NOUN”.
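As a quick optional check, something like this lists the lemmas in this data that get tagged as more than one part of speech:

# Lemmas that udpipe tagged with two or more different parts of speech
all_three_full |>
  distinct(lemma, upos) |>
  count(lemma, sort = TRUE) |>
  filter(n > 1)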
all_three_full <- all_three_full |>
mutate(lemma_upos = paste0(lemma, "_", upos))
all_three_full |>
select(lemma_upos)
## # A tibble: 43,017 × 1
## lemma_upos
## <chr>
## 1 and_CCONJ
## 2 now_ADV
## 3 I_PRON
## 4 ,_PUNCT
## 5 Moroni_PROPN
## 6 ,_PUNCT
## 7 proceed_VERB
## 8 to_PART
## 9 give_VERB
## 10 a_DET
## # ℹ 43,007 more rows

It’s okay that these aren’t actual regular English words. bind_tf_idf() doesn’t care—it’s only looking at the frequency of things inside documents.
Also it’s okay that we have weird looking words like this, because we’ll use separate() to split those words back into two separate columns in the end:
tf_idf_pos <- all_three_full |>
count(book_title, lemma_upos, sort = TRUE) |>
bind_tf_idf(lemma_upos, book_title, n) |>
separate(lemma_upos, into = c("lemma", "upos"), sep = "_") |>
arrange(desc(tf_idf))
tf_idf_pos
## # A tibble: 4,229 × 7
## book_title lemma upos n tf idf tf_idf
## <ord> <chr> <chr> <int> <dbl> <dbl> <dbl>
## 1 Daniel Daniel PROPN 72 0.00535 1.10 0.00587
## 2 Daniel king NOUN 183 0.0136 0.405 0.00551
## 3 Ether Jared PROPN 83 0.00446 1.10 0.00490
## 4 Ether Coriantumr PROPN 63 0.00338 1.10 0.00372
## 5 Daniel prince NOUN 33 0.00245 1.10 0.00269
## 6 Daniel vision NOUN 30 0.00223 1.10 0.00245
## 7 Romans Christ PROPN 66 0.00603 0.405 0.00245
## 8 Ether land NOUN 111 0.00596 0.405 0.00242
## 9 Ether Shiz PROPN 30 0.00161 1.10 0.00177
## 10 Ether brother NOUN 76 0.00408 0.405 0.00166
## # ℹ 4,219 more rows

Heck yeah. “Daniel” is the most unique proper noun in the Book of Daniel while “king” is the most unique regular noun.
Some plots because why not:
most_unique_nouns <- tf_idf_pos |>
filter(upos %in% c("PROPN", "NOUN")) |>
group_by(book_title) |>
slice_max(tf_idf, n = 10) |>
ungroup() |>
mutate(word = fct_inorder(lemma))
ggplot(most_unique_nouns, aes(x = tf_idf, y = fct_rev(word), fill = book_title)) +
geom_col() +
scale_fill_brewer(palette = "Set1") +
guides(fill = "none") +
labs(y = NULL, title = "Most unique nouns") +
facet_wrap(vars(book_title), scales = "free_y") +
theme_bw() +
theme(plot.title = element_text(face = "bold"))
most_unique_verbs <- tf_idf_pos |>
filter(upos == "VERB") |>
group_by(book_title) |>
slice_max(tf_idf, n = 10) |>
ungroup() |>
mutate(word = fct_inorder(lemma))
ggplot(most_unique_verbs, aes(x = tf_idf, y = fct_rev(word), fill = book_title)) +
geom_col() +
scale_fill_brewer(palette = "Set1") +
guides(fill = "none") +
labs(y = NULL, title = "Most unique verbs") +
facet_wrap(vars(book_title), scales = "free_y") +
theme_bw() +
theme(plot.title = element_text(face = "bold"))
Tagging with spaCy
udpipe is fast, but not completely accurate. Notice how, in the verb plot above, the fourth most unique “verb” in Daniel is “Nebuchadnezzar” and the most unique is “A”. Those obviously aren’t verbs. Either udpipe is bad at biblical words, or it’s not great in general.
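If you’d rather check this in the data than squint at the plot, something like this pulls the top udpipe-tagged “verbs” in Daniel:

# Top tf-idf "verbs" in Daniel according to udpipe (some are clearly mis-tagged)
tf_idf_pos |>
  filter(book_title == "Daniel", upos == "VERB") |>
  slice_max(tf_idf, n = 5)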
Installing spaCy
spaCy is generally more accurate, but you need to install it and run it through Python. You can either do this on your own in the terminal and manage your own Python installation (yuck) or let R create a virtual Python environment for you with the {reticulate} package so you can manage and run everything from R itself.
You should only have to do this once on your computer:
library(reticulate) # For accessing Python from R
# Create a new virtual environment
virtualenv_create("tagging-fun-times")
# Install the cleannlp Python package
py_install("cleannlp", envname = "tagging-fun-times")
# Download the spaCy text data
cleanNLP::cnlp_download_spacy("en_core_web_sm")Install
uvfor easy Python installation and package management: https://docs.astral.sh/uv/#installationRun this in terminal (
uvwill automatically install Python if needed):
uv venv tagging-fun-times
source tagging-fun-times/bin/activate
uv pip install spacy
python -m spacy download enActual tagging
As before with the udpipe tagger, we’ll use cnlp_annotate(). The code is actually all identical—we just need to tell R to use the spaCy tagger with cnlp_init_spacy():
# Tell R to use the Python virtual environment made earlier
reticulate::use_virtualenv("tagging-fun-times", required = TRUE)
# Tell R to use spaCy
cnlp_init_spacy(model_name = "en_core_web_sm")
all_three_tagged_spacy <- all_three |>
cnlp_annotate(text_name = "text", doc_name = "id")all_three_tagged_spacy$token
## # A tibble: 43,011 × 10
## doc_id sid tid token token_with_ws lemma upos xpos tid_source relation
## <int> <int> <int> <chr> <chr> <chr> <chr> <chr> <int> <chr>
## 1 1 1 1 And "And " and CCONJ CC 7 cc
## 2 1 1 2 now "now " now ADV RB 7 advmod
## 3 1 1 3 I "I" I PRON PRP 7 nsubj
## 4 1 1 4 , ", " , PUNCT , 3 punct
## 5 1 1 5 Moroni "Moroni" Moroni PROPN NNP 3 appos
## 6 1 1 6 , ", " , PUNCT , 3 punct
## 7 1 1 7 proceed "proceed " proceed VERB VBP 0 root
## 8 1 1 8 to "to " to PART TO 9 aux
## 9 1 1 9 give "give " give VERB VB 7 xcomp
## 10 1 1 10 an "an " an DET DT 11 det
## # ℹ 43,001 more rows

This part-of-speech tagging process can take a long time. If you don’t want to keep rerunning this all the time, you should probably save the dataset as some intermediate object, like an .rds file or a CSV:
# Like this:
saveRDS(all_three_tagged_spacy, "tagged-text.rds")
# Or like this:
write_csv(all_three_tagged_spacy$token, "tagged-text.csv")…and then load that instead of retagging the text:
# Like this:
all_three_tagged_spacy <- readRDS("tagged-text.rds")
# Or like this (note: the CSV only has the $token data frame, not the whole tagged object):
all_three_tagged_spacy <- read_csv("tagged-text.csv")

Just like before, we have columns for lemma (the base form of the word), upos (Universal POS tag), and xpos (Penn Treebank code).
We can join the other columns back in just like the udpipe version too:
all_three_lookup <- all_three |>
select(id, book_title, chapter_number, verse_number)
all_three_full_spacy <- all_three_tagged_spacy$token |>
left_join(all_three_lookup, by = join_by(doc_id == id))
all_three_full_spacy
## # A tibble: 43,011 × 13
## doc_id sid tid token token_with_ws lemma upos xpos tid_source relation book_title chapter_number verse_number
## <int> <int> <int> <chr> <chr> <chr> <chr> <chr> <int> <chr> <ord> <dbl> <dbl>
## 1 1 1 1 And "And " and CCONJ CC 7 cc Ether 1 1
## 2 1 1 2 now "now " now ADV RB 7 advmod Ether 1 1
## 3 1 1 3 I "I" I PRON PRP 7 nsubj Ether 1 1
## 4 1 1 4 , ", " , PUNCT , 3 punct Ether 1 1
## 5 1 1 5 Moroni "Moroni" Moroni PROPN NNP 3 appos Ether 1 1
## 6 1 1 6 , ", " , PUNCT , 3 punct Ether 1 1
## 7 1 1 7 proceed "proceed " proceed VERB VBP 0 root Ether 1 1
## 8 1 1 8 to "to " to PART TO 9 aux Ether 1 1
## 9 1 1 9 give "give " give VERB VB 7 xcomp Ether 1 1
## 10 1 1 10 an "an " an DET DT 11 det Ether 1 1
## # ℹ 43,001 more rows

Most unique nouns/verbs by book
Just like before, we’ll make a new column called lemma_upos that is the combination of the word and the part of speech, like “now_ADV” and “Moroni_PROPN”.
all_three_full_spacy <- all_three_full_spacy |>
mutate(lemma_upos = paste0(lemma, "_", upos))
all_three_full_spacy |>
select(lemma_upos)
## # A tibble: 43,011 × 1
## lemma_upos
## <chr>
## 1 and_CCONJ
## 2 now_ADV
## 3 I_PRON
## 4 ,_PUNCT
## 5 Moroni_PROPN
## 6 ,_PUNCT
## 7 proceed_VERB
## 8 to_PART
## 9 give_VERB
## 10 an_DET
## # ℹ 43,001 more rows

And then we can calculate the tf-idf:
tf_idf_pos_spacy <- all_three_full_spacy |>
count(book_title, lemma_upos, sort = TRUE) |>
bind_tf_idf(lemma_upos, book_title, n) |>
separate(lemma_upos, into = c("lemma", "upos"), sep = "_") |>
arrange(desc(tf_idf))
tf_idf_pos_spacy
## # A tibble: 3,903 × 7
## book_title lemma upos n tf idf tf_idf
## <ord> <chr> <chr> <int> <dbl> <dbl> <dbl>
## 1 Daniel Daniel PROPN 75 0.00557 1.10 0.00612
## 2 Daniel king NOUN 186 0.0138 0.405 0.00560
## 3 Ether Jared PROPN 86 0.00462 1.10 0.00508
## 4 Ether Coriantumr PROPN 62 0.00333 1.10 0.00366
## 5 Daniel prince NOUN 33 0.00245 1.10 0.00269
## 6 Daniel vision NOUN 32 0.00238 1.10 0.00261
## 7 Ether land NOUN 118 0.00634 0.405 0.00257
## 8 Romans Christ PROPN 68 0.00622 0.405 0.00252
## 9 Daniel Nebuchadnezzar PROPN 30 0.00223 1.10 0.00245
## 10 Ether show VERB 36 0.00193 1.10 0.00213
## # ℹ 3,893 more rows

Finally, some plots. These look a lot better! All the verbs are actually verbs this time—spaCy worked nicely.
most_unique_nouns_spacy <- tf_idf_pos_spacy |>
filter(upos %in% c("PROPN", "NOUN")) |>
group_by(book_title) |>
slice_max(tf_idf, n = 10) |>
ungroup() |>
mutate(word = fct_inorder(lemma))
ggplot(
most_unique_nouns_spacy,
aes(x = tf_idf, y = fct_rev(word), fill = book_title)
) +
geom_col() +
scale_fill_brewer(palette = "Set1") +
guides(fill = "none") +
labs(y = NULL, title = "Most unique nouns (spaCy)") +
facet_wrap(vars(book_title), scales = "free_y") +
theme_bw() +
theme(plot.title = element_text(face = "bold"))
most_unique_verbs_spacy <- tf_idf_pos_spacy |>
filter(upos == "VERB") |>
group_by(book_title) |>
slice_max(tf_idf, n = 10) |>
ungroup() |>
mutate(word = fct_inorder(lemma))
ggplot(
most_unique_verbs_spacy,
aes(x = tf_idf, y = fct_rev(word), fill = book_title)
) +
geom_col() +
scale_fill_brewer(palette = "Set1") +
guides(fill = "none") +
labs(y = NULL, title = "Most unique verbs (spaCy)") +
facet_wrap(vars(book_title), scales = "free_y") +
theme_bw() +
theme(plot.title = element_text(face = "bold"))
The whole game
All of that ↑ was fairly didactic and separated into multiple steps.
Here’s ↓ what the whole process looks like all together using spaCy:
library(tidyverse)
library(tidytext)
library(scriptuRs)
library(cleanNLP)
ether <- scriptuRs::book_of_mormon |>
filter(book_title == "Ether")
daniel <- scriptuRs::old_testament |>
filter(book_title == "Daniel")
romans <- scriptuRs::new_testament |>
filter(book_title == "Romans")
all_three <- bind_rows(ether, daniel, romans) |>
# Having a unique id number column will be helpful later for joining things
mutate(id = row_number()) |>
# Put these in order: OT > NT > BoM
mutate(
book_title = factor(
book_title,
levels = c("Daniel", "Romans", "Ether"),
ordered = TRUE
)
)
# --------------------------------------------------------------
# This section only needs to run once if you save the
# all_three_tagged_spacy data as RDS or CSV
# Tell R to use the Python virtual environment made earlier
reticulate::use_virtualenv("tagging-fun-times", required = TRUE)
# Tell R to use spaCy
cnlp_init_spacy(model_name = "en_core_web_sm")
# Tag the parts of speech
all_three_tagged_spacy <- all_three |>
cnlp_annotate(text_name = "text", doc_name = "id")
# Save this so it doesn't have to run again
saveRDS(all_three_tagged_spacy, "tagged-text.rds")
# --------------------------------------------------------------
# --------------------------------------------------------------
# Use this if you want to skip the tagging process and instead
# bring in the pre-tagged words that you tagged earlier
all_three_tagged_spacy <- readRDS("tagged-text.rds")
# --------------------------------------------------------------
# Bring back some of the original columns we care about
all_three_lookup <- all_three |>
select(id, book_title, chapter_number, verse_number)
# Join the original columns, make a WORD_POS word
all_three_full_spacy <- all_three_tagged_spacy$token |>
left_join(all_three_lookup, by = join_by(doc_id == id)) |>
mutate(lemma_upos = paste0(lemma, "_", upos))
# Calculate tf-idfs
tf_idf_pos_spacy <- all_three_full_spacy |>
count(book_title, lemma_upos, sort = TRUE) |>
bind_tf_idf(lemma_upos, book_title, n) |>
separate(lemma_upos, into = c("lemma", "upos"), sep = "_") |>
arrange(desc(tf_idf))
# Plot stuff
most_unique_nouns_spacy <- tf_idf_pos_spacy |>
filter(upos %in% c("PROPN", "NOUN")) |>
group_by(book_title) |>
slice_max(tf_idf, n = 10) |>
ungroup() |>
mutate(word = fct_inorder(lemma))
ggplot(
most_unique_nouns_spacy,
aes(x = tf_idf, y = fct_rev(word), fill = book_title)
) +
geom_col() +
scale_fill_brewer(palette = "Set1") +
guides(fill = "none") +
labs(y = NULL, title = "Most unique nouns (spaCy)") +
facet_wrap(vars(book_title), scales = "free_y") +
theme_bw() +
theme(plot.title = element_text(face = "bold"))