Load the library. This is available from CRAN and the latest development versions can be found on GitHub.
library(CorporaCoCo)
A passage from Alice’s Adventures in Wonderland:
# Raw passage used as text "a" throughout the examples.
a <- "`But do cats eat bats, I wonder?' And here Alice began to get rather sleepy, and went on saying to herself, in a dreamy sort of way, `Do cats eat bats? Do cats eat bats?' and sometimes, `Do bats eat cats?' for, you see, as she couldn't answer either question, it didn't much matter which way she put it."
Create a `corp_text` object:
# Build the corp_text object for the passage using the default tokenization.
a_text <- corp_text(a)
We used the default tokenization, so let’s look at the type mappings:
corp_type_lookup(a_text)
# type tokens
# 1: a a
# 2: alice Alice
# 3: and And, and
# 4: answer answer
# 5: as as
# 6: bats bats
# 7: began began
# 8: but But
# 9: cats cats
# 10: couldn't couldn't
# 11: didn't didn't
# 12: do do, Do
# 13: dreamy dreamy
# 14: eat eat
# 15: either either
# 16: for for
# 17: get get
# 18: here here
# 19: herself herself
# 20: i I
# 21: in in
# 22: it it
# 23: matter matter
# 24: much much
# 25: of of
# 26: on on
# 27: put put
# 28: question question
# 29: rather rather
# 30: saying saying
# 31: see see
# 32: she she
# 33: sleepy sleepy
# 34: sometimes sometimes
# 35: sort sort
# 36: to to
# 37: way way
# 38: went went
# 39: which which
# 40: wonder wonder
# 41: you you
# type tokens
You can combine `corp_text` objects:
# Two short texts, each turned into its own corp_text object ...
aa <- "The cat sat on the mat."
bb <- "This dog ate this cat."
aaa <- corp_text(aa)
bbb <- corp_text(bb)
# ... then collected in a list and combined into a single corp_text object.
ccc <- list(aaa, bbb)
corp_text_rbindlist(ccc)
# $text
# [1] "The cat sat on the mat. This dog ate this cat."
#
# $tokens
# idx type start end token
# 1: 1 the 1 3 The
# 2: 2 cat 5 7 cat
# 3: 3 sat 9 11 sat
# 4: 4 on 13 14 on
# 5: 5 the 16 18 the
# 6: 6 mat 20 22 mat
# 7: 7 this 25 28 This
# 8: 8 dog 30 32 dog
# 9: 9 ate 34 36 ate
# 10: 10 this 38 41 this
# 11: 11 cat 43 45 cat
#
# attr(,"class")
# [1] "corp_text"
# attr(,"PACKAGE_VERSION")
# [1] '2.0'
# attr(,"DATE")
# [1] "2022-08-05"
You can look at concordance lines using `corp_text` objects:
corp_concordance(a_text, span = "4LR", nodes = c("eat"))
# [1] NA--- NANABut do cats eat bats, I wonder?' And --- here Alice began
# [2] a dreamy sort --- of way, `Do cats eat bats? Do cats eat --- bats?' and sometimes
# [3] way, `Do cats --- eat bats? Do cats eat bats?' and sometimes, `Do --- bats eat cats
# [4] cats eat bats?' --- and sometimes, `Do bats eat cats?' for, you see, --- as she couldn't
If you supply one or more `collocates`, they will be highlighted and act as a filter:
# filtering for one collocate
y <- corp_concordance(
  a_text,
  span = "4LR",
  nodes = c("eat"),
  collocates = c("but")
)
y
# [1] NA--- NANA*But* do cats eat bats, I wonder?' And --- here Alice began
# filtering for two collocates
y <- corp_concordance(
  a_text,
  span = "4LR",
  nodes = c("eat"),
  collocates = c("but", "and")
)
y
# [1] NA--- NANA*But* do cats eat bats, I wonder?' *And* --- here Alice began
# [2] way, `Do cats --- eat bats? Do cats eat bats?' *and* sometimes, `Do --- bats eat cats
# [3] cats eat bats?' --- *and* sometimes, `Do bats eat cats?' for, you see, --- as she couldn't
If you do not want the collocates to act as a filter, but you just want to highlight them in the full concordance, you can add `collocates` to the `print` function:
# rerunning the original concordance without a collocate filter
o <- corp_concordance(a_text, span = "4LR", nodes = c("eat"))
# printing the concordance and adding collocate highlighting
print(o, collocates = c("but"))
# [1] NA--- NANA*But* do cats eat bats, I wonder?' And --- here Alice began
# [2] a dreamy sort --- of way, `Do cats eat bats? Do cats eat --- bats?' and sometimes
# [3] way, `Do cats --- eat bats? Do cats eat bats?' and sometimes, `Do --- bats eat cats
# [4] cats eat bats?' --- and sometimes, `Do bats eat cats?' for, you see, --- as she couldn't
Because a `corp_concordance` object is just a `data.table`, you can sort it and filter it. So in the following example of a slightly longer concordance, we can demonstrate the sorting by 1. the node and 2. the first type to the right of the node (“R1_type” position):
# Concordance for two nodes, displayed sorted by the node column and then by
# the type immediately to the right of the node.
s <- corp_concordance(a_text, span = "4LR", nodes = c("cats", "bats"))
s[order(N, R1_type)]
# [1] Do cats eat --- bats? Do cats eat bats?' and sometimes, `Do bats --- eat cats?' for
# [2] dreamy sort of --- way, `Do cats eat bats? Do cats eat bats?' --- and sometimes, `Do
# [3] Do cats eat --- bats?' and sometimes, `Do bats eat cats?' for, you --- see, as she
# [4] NA--- But do cats eat bats, I wonder?' And here --- Alice began to
# [5] NA--- NANANANABut do cats eat bats, I wonder?' --- And here Alice
# [6] in a dreamy --- sort of way, `Do cats eat bats? Do cats --- eat bats?' and
# [7] of way, `Do --- cats eat bats? Do cats eat bats?' and sometimes, `--- Do bats eat
# [8] eat bats?' and --- sometimes, `Do bats eat cats?' for, you see, as --- she couldn't answer
As with any other `data.table`, you can also save the output to a csv file using `write.csv`, e.g. `write.csv(s, "concordance.csv")`.
You can sort the concordance using the types or the tokens:
names(y)
# [1] "idx" "CL" "L4" "L3" "L2" "L1" "N" "R1" "R2"
# [10] "R3" "R4" "CR" "L4_type" "L3_type" "L2_type" "L1_type" "N_type" "R1_type"
# [19] "R2_type" "R3_type" "R4_type" "_L4" "_L3" "_L2" "_L1" "_N" "_R1"
# [28] "_R2" "_R3" "_R4"
and if you want to see the raw `data.table`:
print(y, as_data_table = TRUE)
# idx CL L4 L3 L2 L1 N R1 R2 R3 R4 CR L4_type
# 1: 4 <NA> <NA> But do cats eat bats I wonder And here Alice began <NA>
# 2: 34 way, `Do cats eat bats Do cats eat bats and sometimes Do bats eat cats eat
# 3: 40 cats eat bats?' and sometimes Do bats eat cats for you see as she couldn't and
# L3_type L2_type L1_type N_type R1_type R2_type R3_type R4_type _L4 _L3 _L2 _L1 _N _R1 _R2
# 1: but do cats eat bats i wonder and <NA> ,
# 2: bats do cats eat bats and sometimes do ? ?'
# 3: sometimes do bats eat cats for you see , ` ?' ,
# _R3 _R4
# 1: ?'
# 2: , `
# 3: ,
This works just like the old `surface` function but now you must pass it `corp_text` objects:
# Surface co-occurrence object for text "a"; later passed to corp_coco().
a_cooccurs <- corp_surface(
  a_text,
  span = '1L1R',
  nodes = c("alice", "bats", "cats")
)
By the way, if you do `corp_concordance` on a `corp_surface` object it will use the `span`, `nodes` and `collocates` values used to create the `corp_surface` object as the default values for `corp_concordance`:
corp_concordance(a_cooccurs)
# [1] NA--- do cats eat --- bats, I wonder
# [2] But do cats --- eat bats, I --- wonder?' And here
# [3] I wonder?' And --- here Alice began --- to get rather
# [4] sort of way, `--- Do cats eat --- bats? Do cats
# [5] way, `Do cats --- eat bats? Do --- cats eat bats
# [6] cats eat bats? --- Do cats eat --- bats?' and sometimes
# [7] bats? Do cats --- eat bats?' and --- sometimes, `Do bats
# [8] bats?' and sometimes, `--- Do bats eat --- cats?' for, you
# [9] sometimes, `Do bats --- eat cats?' for, --- you see, as
The same passage after it has been translated into Finnish and back to English using Google translate:
# Second text "b": the round-tripped translation of the same passage.
b <- "`But cats eat bats, I wonder?' And here, Alice began to get pretty sleepy and went on to say to herself, in a dreamlike way: `Are cats eating bats? Are cats eating bats?' And sometimes, `Do cats have to eat cats?' since you see because he could not answer the questions, he did not really matter how he put it."
b_text <- corp_text(b)

# Same span and nodes as used for text "a", so the two results are comparable.
b_cooccurs <- corp_surface(
  b_text,
  span = '1L1R',
  nodes = c("alice", "bats", "cats")
)
b_cooccurs
# x y H M
# 1: alice began 1 1
# 2: alice here 1 1
# 3: bats and 1 5
# 4: bats are 1 5
# 5: bats eat 1 5
# 6: bats eating 2 4
# 7: bats i 1 5
# 8: cats are 2 8
# 9: cats but 1 9
# 10: cats do 1 9
# 11: cats eat 2 8
# 12: cats eating 2 8
# 13: cats have 1 9
# 14: cats since 1 9
# Compare the co-occurrence objects of the two texts for the chosen nodes and
# plot the result.
cats_and_bats <- corp_coco(
  a_cooccurs,
  b_cooccurs,
  nodes = c("alice", "bats", "cats"),
  fdr = 1.0
)
plot(cats_and_bats)
Looks like there is a difference for cats and bats but the text is small so we have no power. Let’s have a look at the tokenization:
corp_type_lookup(a_text)
# type tokens
# 1: a a
# 2: alice Alice
# 3: and And, and
# 4: answer answer
# 5: as as
# 6: bats bats
# 7: began began
# 8: but But
# 9: cats cats
# 10: couldn't couldn't
# 11: didn't didn't
# 12: do do, Do
# 13: dreamy dreamy
# 14: eat eat
# 15: either either
# 16: for for
# 17: get get
# 18: here here
# 19: herself herself
# 20: i I
# 21: in in
# 22: it it
# 23: matter matter
# 24: much much
# 25: of of
# 26: on on
# 27: put put
# 28: question question
# 29: rather rather
# 30: saying saying
# 31: see see
# 32: she she
# 33: sleepy sleepy
# 34: sometimes sometimes
# 35: sort sort
# 36: to to
# 37: way way
# 38: went went
# 39: which which
# 40: wonder wonder
# 41: you you
# type tokens
Since cats and bats are both types of mammals maybe we can look at the co-occurrences with mammals:
# Extract the token tables from both corp_text objects.
a_tokens <- corp_get_tokens(a_text)
b_tokens <- corp_get_tokens(b_text)

# Relabel the types "cats" and "bats" (exact matches only) as "MAMMALS".
a_tokens$type <- sub("^(cats|bats)$", "MAMMALS", a_tokens$type)
b_tokens$type <- sub("^(cats|bats)$", "MAMMALS", b_tokens$type)

# Rebuild the corp_text objects from the original strings and the modified
# token tables.
a_text_mammals <- corp_text(a, tokens = a_tokens)
# Warning: 'tokens': the 'idx' and 'token' columns are being recalculated
b_text_mammals <- corp_text(b, tokens = b_tokens)
# Warning: 'tokens': the 'idx' and 'token' columns are being recalculated
So now we have a new type to token mapping:
corp_type_lookup(a_text_mammals)
# type tokens
# 1: MAMMALS cats, bats
# 2: a a
# 3: alice Alice
# 4: and And, and
# 5: answer answer
# 6: as as
# 7: began began
# 8: but But
# 9: couldn't couldn't
# 10: didn't didn't
# 11: do do, Do
# 12: dreamy dreamy
# 13: eat eat
# 14: either either
# 15: for for
# 16: get get
# 17: here here
# 18: herself herself
# 19: i I
# 20: in in
# 21: it it
# 22: matter matter
# 23: much much
# 24: of of
# 25: on on
# 26: put put
# 27: question question
# 28: rather rather
# 29: saying saying
# 30: see see
# 31: she she
# 32: sleepy sleepy
# 33: sometimes sometimes
# 34: sort sort
# 35: to to
# 36: way way
# 37: went went
# 38: which which
# 39: wonder wonder
# 40: you you
# type tokens
and so:
# Co-occurrence objects for both texts, now using the merged MAMMALS type as a
# node.
a_cooccurs_mammals <- corp_surface(
  a_text_mammals,
  span = '1L1R',
  nodes = c("alice", "MAMMALS")
)
b_cooccurs_mammals <- corp_surface(
  b_text_mammals,
  span = '1L1R',
  nodes = c("alice", "MAMMALS")
)

# Compare the two texts on those nodes and plot the result.
mammals <- corp_coco(
  a_cooccurs_mammals,
  b_cooccurs_mammals,
  nodes = c("alice", "MAMMALS"),
  fdr = 1.0
)
plot(mammals)