Welcome to cefrpy’s documentation!

Indices and tables

About cefrpy

The cefrpy Python module offers a comprehensive toolkit for analyzing linguistic data based on the Common European Framework of Reference for Languages (CEFR).

Installation

You can install cefrpy for Python >= 3.6 via pip:

pip install cefrpy

Usage examples

Getting part of speech level of a word

from cefrpy import CEFRAnalyzer

analyzer = CEFRAnalyzer()

word = "happy"
pos_tag = "JJ"  # Adjective
level = analyzer.get_word_pos_level_float(word, pos_tag)
if level is not None:
    print(f"The level of '{word}' as a {pos_tag} is: {level}")
else:
    print(f"Level not found for '{word}' as a {pos_tag}.")


# You can also get the level in CEFR format
cefr_level = analyzer.get_word_pos_level_CEFR(word, pos_tag)
if cefr_level is not None:
    print(f"The CEFR level of '{word}' as a {pos_tag} is: {cefr_level}")
else:
    print(f"CEFR level not found for '{word}' as a {pos_tag}.")

Getting average level of a word:

from cefrpy import CEFRAnalyzer

analyzer = CEFRAnalyzer()

word = "supremacy"
average_level = analyzer.get_average_word_level_float(word)
if average_level is not None:
    print(f"The average level of '{word}' is: {average_level}")
else:
    print(f"Average level not found for '{word}'.")


# You can also get the average level in CEFR format
cefr_average_level = analyzer.get_average_word_level_CEFR(word)
if cefr_average_level is not None:
    print(f"The CEFR average level of '{word}' is: {cefr_average_level}")
else:
    print(f"CEFR average level not found for '{word}'.")

Additional features

Get all possible part-of-speech tags for a word

from cefrpy import CEFRAnalyzer

analyzer = CEFRAnalyzer()

print(analyzer.get_all_pos_for_word("test")) # [<POSTag.JJ: 4>, <POSTag.NN: 8>, <POSTag.VB: 19>]
print(analyzer.get_all_pos_for_word_as_str("test")) # ['JJ', 'NN', 'VB']

# {<POSTag.JJ: 4>: <CEFRLevel.A2: 2>, <POSTag.NN: 8>: <CEFRLevel.A1: 1>, <POSTag.VB: 19>: <CEFRLevel.B2: 4>}
print(analyzer.get_pos_level_dict_for_word("test"))

# {'JJ': 2.5, 'NN': 1.0, 'VB': 4.0}
print(analyzer.get_pos_level_dict_for_word("test", pos_tag_as_string=True, word_level_as_float=True))

Checking if a word exists in the database

from cefrpy import CEFRAnalyzer

analyzer = CEFRAnalyzer()

word = "apple"
if analyzer.is_word_in_database(word):
    print(f"'{word}' exists in the database.")
else:
    print(f"'{word}' does not exist in the database.")

Checking if a word with a specific part-of-speech exists in the database

from cefrpy import CEFRAnalyzer

analyzer = CEFRAnalyzer()

word = "run"
pos_tag = "VB"  # Verb
if analyzer.is_word_pos_in_database(word, pos_tag):
    print(f"'{word}' with part of speech '{pos_tag}' exists in the database.")
else:
    print(f"'{word}' with part of speech '{pos_tag}' does not exist in the database.")

POSTag usage examples

from cefrpy import POSTag

# Get list of all part-of-speech tag names
print(POSTag.get_all_tags()) # ['CC', 'CD', 'DT', ...]

# Print total tags
print(POSTag.get_total_tags()) # 28

# Get description for a tag
print(POSTag.get_description_by_tag_name('NN')) # Noun, singular or mass

tag = POSTag.VB
print(tag)                          # VB
print(POSTag.get_description(tag))  # Verb, base form
print(int(tag))                     # 19 (unique tag id)
print(tag == POSTag.NN)             # False

CEFRLevel usage examples

from cefrpy import CEFRLevel

level = CEFRLevel.A1
print(level)            # A1
print(int(level))       # 1

level2 = CEFRLevel.C2
print(level2)           # C2
print(int(level2))      # 6

# You can perform any comparisons:
print(level2 > level)   # True
print(level2 == level)  # False

print(CEFRLevel.from_str("B1") == CEFRLevel.B1) # True
print(CEFRLevel.from_str("B1") == CEFRLevel(3)) # True

CEFRAnalyzer generator (yield) methods

For every example you should import and initialize CEFRAnalyzer:

from cefrpy import CEFRAnalyzer

analyzer = CEFRAnalyzer()

Iterating over words with a specific length (alphabetical order)

iteration_limit = 10
word_list = []
for word in analyzer.yield_words_with_length(6):
    if iteration_limit == 0:
        break
    word_list.append(word)
    iteration_limit -= 1

# ['aaberg', 'aachen', 'aahing', 'aargau', 'aarhus', 'abacus', 'abadan', 'abadia', 'abakan', 'abaris']
print(word_list)

Iterating over words with a specific length (reversed alphabetical order)

iteration_limit = 10
word_list = []
for word in analyzer.yield_words_with_length(6, reverse_order=True):
    if iteration_limit == 0:
        break
    word_list.append(word)
    iteration_limit -= 1

# ['zymase', 'zygote', 'zygoma', 'zydeco', 'zwolle', 'zwicky', 'zuzana', 'zusman', 'zurvan', 'zurich']
print(word_list)

Iterating over words in alphabetical order

iteration_limit = 10
word_list = []
for word in analyzer.yield_words():
    if iteration_limit == 0:
        break
    word_list.append(word)
    iteration_limit -= 1

# ['a', 'aa', 'aaa', 'aaaa', 'aaas', 'aaberg', 'aachen', 'aae', 'aaee', 'aaf']
print(word_list)

Iterating over words with their POS tags in alphabetical order, with ascending word-length priority

iteration_limit = 6
word_pos_list = []
for word, pos_tag in analyzer.yield_word_pos(word_length_sort=True):
    if iteration_limit == 0:
        break
    word_pos_list.append((word, pos_tag))
    iteration_limit -= 1

# [('a', <POSTag.DT: 2>), ('a', <POSTag.IN: 3>), ('a', <POSTag.JJ: 4>), ('a', <POSTag.NN: 8>), ('a', <POSTag.VB: 19>), ('b', <POSTag.JJ: 4>)]
print(word_pos_list)

Iterating over words with their POS tags as strings and levels as floats, in reversed alphabetical order with descending word-length priority

iteration_limit = 3
word_pos_list = []
for word, pos_tag, level in analyzer.yield_word_pos_level(word_length_sort=True, reverse_order=True, pos_tag_as_string=True, word_level_as_float=True):
    if iteration_limit == 0:
        break
    word_pos_list.append((word, pos_tag, level))
    iteration_limit -= 1

# [('demethylchlortetracycline', 'NN', 6.0), ('electrocardiographically', 'RB', 6.0), ('polytetrafluoroethylene', 'NN', 6.0)]
print(word_pos_list)

License

This project is licensed under the MIT License - see the LICENSE file for details.

Acknowledgments

I would like to acknowledge the contributions of the following resources, which I used to create the initial SQLite version of my Words-CEFR-Dataset:

I also used these resources to create my list of valid English words: