Document (#39920)

Author
Cruys, T. van de
Moirón, B.V.
Title
Semantics-based multiword expression extraction
Source
Proceedings of the Workshop on A Broader Perspective on Multiword Expressions, Prag 2007
Imprint
Prag : Association for Computational Linguistics
Year
2007
Pages
S.25-32
Abstract
This paper describes a fully unsupervised and automated method for large-scale extraction of multiword expressions (MWEs) from large corpora. The method aims at capturing the non-compositionality of MWEs; the intuition is that a noun within a MWE cannot easily be replaced by a semantically similar noun. To implement this intuition, a noun clustering is automatically extracted (using distributional similarity measures), which gives us clusters of semantically related nouns. Next, a number of statistical measures - based on selectional preferences - are developed that formalize the intuition of non-compositionality. Our approach has been tested on Dutch, and automatically evaluated using Dutch lexical resources.
Theme
Computerlinguistik

Similar documents (content)

  1. Nagy T., I.: Detecting multiword expressions and named entities in natural language texts (2014) 0.38
    0.38158965 = sum of:
      0.38158965 = product of:
        0.9539741 = sum of:
          0.030905958 = weight(abstract_txt:lexical in 1536) [ClassicSimilarity], result of:
            0.030905958 = score(doc=1536,freq=2.0), product of:
              0.08572219 = queryWeight, product of:
                1.0017344 = boost
                6.5264034 = idf(docFreq=175, maxDocs=44218)
                0.013111935 = queryNorm
              0.36053625 = fieldWeight in 1536, product of:
                1.4142135 = tf(freq=2.0), with freq of:
                  2.0 = termFreq=2.0
                6.5264034 = idf(docFreq=175, maxDocs=44218)
                0.0390625 = fieldNorm(doc=1536)
          0.022385357 = weight(abstract_txt:expression in 1536) [ClassicSimilarity], result of:
            0.022385357 = score(doc=1536,freq=1.0), product of:
              0.087106615 = queryWeight, product of:
                1.009791 = boost
                6.578893 = idf(docFreq=166, maxDocs=44218)
                0.013111935 = queryNorm
              0.25698802 = fieldWeight in 1536, product of:
                1.0 = tf(freq=1.0), with freq of:
                  1.0 = termFreq=1.0
                6.578893 = idf(docFreq=166, maxDocs=44218)
                0.0390625 = fieldNorm(doc=1536)
          0.06965771 = weight(abstract_txt:expressions in 1536) [ClassicSimilarity], result of:
            0.06965771 = score(doc=1536,freq=8.0), product of:
              0.09283063 = queryWeight, product of:
                1.0424412 = boost
                6.7916126 = idf(docFreq=134, maxDocs=44218)
                0.013111935 = queryNorm
              0.75037426 = fieldWeight in 1536, product of:
                2.828427 = tf(freq=8.0), with freq of:
                  8.0 = termFreq=8.0
                6.7916126 = idf(docFreq=134, maxDocs=44218)
                0.0390625 = fieldNorm(doc=1536)
          0.006530367 = weight(abstract_txt:using in 1536) [ClassicSimilarity], result of:
            0.006530367 = score(doc=1536,freq=1.0), product of:
              0.048273653 = queryWeight, product of:
                1.0631046 = boost
                3.4631186 = idf(docFreq=3765, maxDocs=44218)
                0.013111935 = queryNorm
              0.13527808 = fieldWeight in 1536, product of:
                1.0 = tf(freq=1.0), with freq of:
                  1.0 = termFreq=1.0
                3.4631186 = idf(docFreq=3765, maxDocs=44218)
                0.0390625 = fieldNorm(doc=1536)
          0.03205775 = weight(abstract_txt:method in 1536) [ClassicSimilarity], result of:
            0.03205775 = score(doc=1536,freq=5.0), product of:
              0.08154246 = queryWeight, product of:
                1.381697 = boost
                4.50095 = idf(docFreq=1333, maxDocs=44218)
                0.013111935 = queryNorm
              0.3931418 = fieldWeight in 1536, product of:
                2.236068 = tf(freq=5.0), with freq of:
                  5.0 = termFreq=5.0
                4.50095 = idf(docFreq=1333, maxDocs=44218)
                0.0390625 = fieldNorm(doc=1536)
          0.03733609 = weight(abstract_txt:automatically in 1536) [ClassicSimilarity], result of:
            0.03733609 = score(doc=1536,freq=2.0), product of:
              0.12250703 = queryWeight, product of:
                1.6935633 = boost
                5.5168705 = idf(docFreq=482, maxDocs=44218)
                0.013111935 = queryNorm
              0.30476692 = fieldWeight in 1536, product of:
                1.4142135 = tf(freq=2.0), with freq of:
                  2.0 = termFreq=2.0
                5.5168705 = idf(docFreq=482, maxDocs=44218)
                0.0390625 = fieldNorm(doc=1536)
          0.06463912 = weight(abstract_txt:extraction in 1536) [ClassicSimilarity], result of:
            0.06463912 = score(doc=1536,freq=3.0), product of:
              0.15430322 = queryWeight, product of:
                1.9006774 = boost
                6.1915555 = idf(docFreq=245, maxDocs=44218)
                0.013111935 = queryNorm
              0.41890973 = fieldWeight in 1536, product of:
                1.7320508 = tf(freq=3.0), with freq of:
                  3.0 = termFreq=3.0
                6.1915555 = idf(docFreq=245, maxDocs=44218)
                0.0390625 = fieldNorm(doc=1536)
          0.33778185 = weight(abstract_txt:multiword in 1536) [ClassicSimilarity], result of:
            0.33778185 = score(doc=1536,freq=11.0), product of:
              0.30133188 = queryWeight, product of:
                2.6560943 = boost
                8.652365 = idf(docFreq=20, maxDocs=44218)
                0.013111935 = queryNorm
              1.1209629 = fieldWeight in 1536, product of:
                3.3166249 = tf(freq=11.0), with freq of:
                  11.0 = termFreq=11.0
                8.652365 = idf(docFreq=20, maxDocs=44218)
                0.0390625 = fieldNorm(doc=1536)
          0.24225646 = weight(abstract_txt:mwes in 1536) [ClassicSimilarity], result of:
            0.24225646 = score(doc=1536,freq=3.0), product of:
              0.37230164 = queryWeight, product of:
                2.952354 = boost
                9.617446 = idf(docFreq=7, maxDocs=44218)
                0.013111935 = queryNorm
              0.65069944 = fieldWeight in 1536, product of:
                1.7320508 = tf(freq=3.0), with freq of:
                  3.0 = termFreq=3.0
                9.617446 = idf(docFreq=7, maxDocs=44218)
                0.0390625 = fieldNorm(doc=1536)
          0.11042348 = weight(abstract_txt:noun in 1536) [ClassicSimilarity], result of:
            0.11042348 = score(doc=1536,freq=1.0), product of:
              0.36404622 = queryWeight, product of:
                3.5755663 = boost
                7.7650614 = idf(docFreq=50, maxDocs=44218)
                0.013111935 = queryNorm
              0.3033227 = fieldWeight in 1536, product of:
                1.0 = tf(freq=1.0), with freq of:
                  1.0 = termFreq=1.0
                7.7650614 = idf(docFreq=50, maxDocs=44218)
                0.0390625 = fieldNorm(doc=1536)
        0.4 = coord(10/25)
    
  2. Snajder, J.; Almic, P.: Modeling semantic compositionality of Croatian multiword expressions (2015) 0.35
    0.34821835 = sum of:
      0.34821835 = product of:
        1.7410917 = sum of:
          0.059106532 = weight(abstract_txt:expressions in 2920) [ClassicSimilarity], result of:
            0.059106532 = score(doc=2920,freq=1.0), product of:
              0.09283063 = queryWeight, product of:
                1.0424412 = boost
                6.7916126 = idf(docFreq=134, maxDocs=44218)
                0.013111935 = queryNorm
              0.6367137 = fieldWeight in 2920, product of:
                1.0 = tf(freq=1.0), with freq of:
                  1.0 = termFreq=1.0
                6.7916126 = idf(docFreq=134, maxDocs=44218)
                0.09375 = fieldNorm(doc=2920)
          0.15642649 = weight(abstract_txt:distributional in 2920) [ClassicSimilarity], result of:
            0.15642649 = score(doc=2920,freq=1.0), product of:
              0.1776129 = queryWeight, product of:
                1.4419267 = boost
                9.394302 = idf(docFreq=9, maxDocs=44218)
                0.013111935 = queryNorm
              0.88071585 = fieldWeight in 2920, product of:
                1.0 = tf(freq=1.0), with freq of:
                  1.0 = termFreq=1.0
                9.394302 = idf(docFreq=9, maxDocs=44218)
                0.09375 = fieldNorm(doc=2920)
          0.24442813 = weight(abstract_txt:multiword in 2920) [ClassicSimilarity], result of:
            0.24442813 = score(doc=2920,freq=1.0), product of:
              0.30133188 = queryWeight, product of:
                2.6560943 = boost
                8.652365 = idf(docFreq=20, maxDocs=44218)
                0.013111935 = queryNorm
              0.8111592 = fieldWeight in 2920, product of:
                1.0 = tf(freq=1.0), with freq of:
                  1.0 = termFreq=1.0
                8.652365 = idf(docFreq=20, maxDocs=44218)
                0.09375 = fieldNorm(doc=2920)
          0.58141553 = weight(abstract_txt:mwes in 2920) [ClassicSimilarity], result of:
            0.58141553 = score(doc=2920,freq=3.0), product of:
              0.37230164 = queryWeight, product of:
                2.952354 = boost
                9.617446 = idf(docFreq=7, maxDocs=44218)
                0.013111935 = queryNorm
              1.5616786 = fieldWeight in 2920, product of:
                1.7320508 = tf(freq=3.0), with freq of:
                  3.0 = termFreq=3.0
                9.617446 = idf(docFreq=7, maxDocs=44218)
                0.09375 = fieldNorm(doc=2920)
          0.699715 = weight(abstract_txt:compositionality in 2920) [ClassicSimilarity], result of:
            0.699715 = score(doc=2920,freq=4.0), product of:
              0.3827117 = queryWeight, product of:
                2.9933453 = boost
                9.7509775 = idf(docFreq=6, maxDocs=44218)
                0.013111935 = queryNorm
              1.8283083 = fieldWeight in 2920, product of:
                2.0 = tf(freq=4.0), with freq of:
                  4.0 = termFreq=4.0
                9.7509775 = idf(docFreq=6, maxDocs=44218)
                0.09375 = fieldNorm(doc=2920)
        0.2 = coord(5/25)
    
  3. Nissim, M.; Zaninello, A.: Modeling the internal variability of multiword expressions through a pattern-based method (2013) 0.27
    0.26876476 = sum of:
      0.26876476 = product of:
        0.8398899 = sum of:
          0.039404355 = weight(abstract_txt:expressions in 990) [ClassicSimilarity], result of:
            0.039404355 = score(doc=990,freq=1.0), product of:
              0.09283063 = queryWeight, product of:
                1.0424412 = boost
                6.7916126 = idf(docFreq=134, maxDocs=44218)
                0.013111935 = queryNorm
              0.4244758 = fieldWeight in 990, product of:
                1.0 = tf(freq=1.0), with freq of:
                  1.0 = termFreq=1.0
                6.7916126 = idf(docFreq=134, maxDocs=44218)
                0.0625 = fieldNorm(doc=990)
          0.010448586 = weight(abstract_txt:using in 990) [ClassicSimilarity], result of:
            0.010448586 = score(doc=990,freq=1.0), product of:
              0.048273653 = queryWeight, product of:
                1.0631046 = boost
                3.4631186 = idf(docFreq=3765, maxDocs=44218)
                0.013111935 = queryNorm
              0.21644491 = fieldWeight in 990, product of:
                1.0 = tf(freq=1.0), with freq of:
                  1.0 = termFreq=1.0
                3.4631186 = idf(docFreq=3765, maxDocs=44218)
                0.0625 = fieldNorm(doc=990)
          0.022229627 = weight(abstract_txt:large in 990) [ClassicSimilarity], result of:
            0.022229627 = score(doc=990,freq=1.0), product of:
              0.07985337 = queryWeight, product of:
                1.3673118 = boost
                4.454089 = idf(docFreq=1397, maxDocs=44218)
                0.013111935 = queryNorm
              0.27838057 = fieldWeight in 990, product of:
                1.0 = tf(freq=1.0), with freq of:
                  1.0 = termFreq=1.0
                4.454089 = idf(docFreq=1397, maxDocs=44218)
                0.0625 = fieldNorm(doc=990)
          0.03244016 = weight(abstract_txt:method in 990) [ClassicSimilarity], result of:
            0.03244016 = score(doc=990,freq=2.0), product of:
              0.08154246 = queryWeight, product of:
                1.381697 = boost
                4.50095 = idf(docFreq=1333, maxDocs=44218)
                0.013111935 = queryNorm
              0.3978315 = fieldWeight in 990, product of:
                1.4142135 = tf(freq=2.0), with freq of:
                  2.0 = termFreq=2.0
                4.50095 = idf(docFreq=1333, maxDocs=44218)
                0.0625 = fieldNorm(doc=990)
          0.040396985 = weight(abstract_txt:measures in 990) [ClassicSimilarity], result of:
            0.040396985 = score(doc=990,freq=1.0), product of:
              0.118915305 = queryWeight, product of:
                1.6685523 = boost
                5.4353957 = idf(docFreq=523, maxDocs=44218)
                0.013111935 = queryNorm
              0.33971223 = fieldWeight in 990, product of:
                1.0 = tf(freq=1.0), with freq of:
                  1.0 = termFreq=1.0
                5.4353957 = idf(docFreq=523, maxDocs=44218)
                0.0625 = fieldNorm(doc=990)
          0.08444419 = weight(abstract_txt:extraction in 990) [ClassicSimilarity], result of:
            0.08444419 = score(doc=990,freq=2.0), product of:
              0.15430322 = queryWeight, product of:
                1.9006774 = boost
                6.1915555 = idf(docFreq=245, maxDocs=44218)
                0.013111935 = queryNorm
              0.54726136 = fieldWeight in 990, product of:
                1.4142135 = tf(freq=2.0), with freq of:
                  2.0 = termFreq=2.0
                6.1915555 = idf(docFreq=245, maxDocs=44218)
                0.0625 = fieldNorm(doc=990)
          0.16295208 = weight(abstract_txt:multiword in 990) [ClassicSimilarity], result of:
            0.16295208 = score(doc=990,freq=1.0), product of:
              0.30133188 = queryWeight, product of:
                2.6560943 = boost
                8.652365 = idf(docFreq=20, maxDocs=44218)
                0.013111935 = queryNorm
              0.5407728 = fieldWeight in 990, product of:
                1.0 = tf(freq=1.0), with freq of:
                  1.0 = termFreq=1.0
                8.652365 = idf(docFreq=20, maxDocs=44218)
                0.0625 = fieldNorm(doc=990)
          0.44757387 = weight(abstract_txt:mwes in 990) [ClassicSimilarity], result of:
            0.44757387 = score(doc=990,freq=4.0), product of:
              0.37230164 = queryWeight, product of:
                2.952354 = boost
                9.617446 = idf(docFreq=7, maxDocs=44218)
                0.013111935 = queryNorm
              1.2021807 = fieldWeight in 990, product of:
                2.0 = tf(freq=4.0), with freq of:
                  4.0 = termFreq=4.0
                9.617446 = idf(docFreq=7, maxDocs=44218)
                0.0625 = fieldNorm(doc=990)
        0.32 = coord(8/25)
    
  4. Vechtomova, O.: ¬A method for automatic extraction of multiword units representing business aspects from user reviews (2014) 0.17
    0.17400274 = sum of:
      0.17400274 = product of:
        0.7250114 = sum of:
          0.013060734 = weight(abstract_txt:using in 1304) [ClassicSimilarity], result of:
            0.013060734 = score(doc=1304,freq=1.0), product of:
              0.048273653 = queryWeight, product of:
                1.0631046 = boost
                3.4631186 = idf(docFreq=3765, maxDocs=44218)
                0.013111935 = queryNorm
              0.27055615 = fieldWeight in 1304, product of:
                1.0 = tf(freq=1.0), with freq of:
                  1.0 = termFreq=1.0
                3.4631186 = idf(docFreq=3765, maxDocs=44218)
                0.078125 = fieldNorm(doc=1304)
          0.04966365 = weight(abstract_txt:method in 1304) [ClassicSimilarity], result of:
            0.04966365 = score(doc=1304,freq=3.0), product of:
              0.08154246 = queryWeight, product of:
                1.381697 = boost
                4.50095 = idf(docFreq=1333, maxDocs=44218)
                0.013111935 = queryNorm
              0.60905266 = fieldWeight in 1304, product of:
                1.7320508 = tf(freq=3.0), with freq of:
                  3.0 = termFreq=3.0
                4.50095 = idf(docFreq=1333, maxDocs=44218)
                0.078125 = fieldNorm(doc=1304)
          0.1843504 = weight(abstract_txt:distributional in 1304) [ClassicSimilarity], result of:
            0.1843504 = score(doc=1304,freq=2.0), product of:
              0.1776129 = queryWeight, product of:
                1.4419267 = boost
                9.394302 = idf(docFreq=9, maxDocs=44218)
                0.013111935 = queryNorm
              1.0379336 = fieldWeight in 1304, product of:
                1.4142135 = tf(freq=2.0), with freq of:
                  2.0 = termFreq=2.0
                9.394302 = idf(docFreq=9, maxDocs=44218)
                0.078125 = fieldNorm(doc=1304)
          0.05049623 = weight(abstract_txt:measures in 1304) [ClassicSimilarity], result of:
            0.05049623 = score(doc=1304,freq=1.0), product of:
              0.118915305 = queryWeight, product of:
                1.6685523 = boost
                5.4353957 = idf(docFreq=523, maxDocs=44218)
                0.013111935 = queryNorm
              0.4246403 = fieldWeight in 1304, product of:
                1.0 = tf(freq=1.0), with freq of:
                  1.0 = termFreq=1.0
                5.4353957 = idf(docFreq=523, maxDocs=44218)
                0.078125 = fieldNorm(doc=1304)
          0.07463882 = weight(abstract_txt:extraction in 1304) [ClassicSimilarity], result of:
            0.07463882 = score(doc=1304,freq=1.0), product of:
              0.15430322 = queryWeight, product of:
                1.9006774 = boost
                6.1915555 = idf(docFreq=245, maxDocs=44218)
                0.013111935 = queryNorm
              0.48371527 = fieldWeight in 1304, product of:
                1.0 = tf(freq=1.0), with freq of:
                  1.0 = termFreq=1.0
                6.1915555 = idf(docFreq=245, maxDocs=44218)
                0.078125 = fieldNorm(doc=1304)
          0.3528016 = weight(abstract_txt:multiword in 1304) [ClassicSimilarity], result of:
            0.3528016 = score(doc=1304,freq=3.0), product of:
              0.30133188 = queryWeight, product of:
                2.6560943 = boost
                8.652365 = idf(docFreq=20, maxDocs=44218)
                0.013111935 = queryNorm
              1.1708074 = fieldWeight in 1304, product of:
                1.7320508 = tf(freq=3.0), with freq of:
                  3.0 = termFreq=3.0
                8.652365 = idf(docFreq=20, maxDocs=44218)
                0.078125 = fieldNorm(doc=1304)
        0.24 = coord(6/25)
    
  5. Ramisch, C.; Schreiner, P.; Idiart, M.; Villavicencio, A.: ¬An evaluation of methods for the extraction of multiword expressions (20xx) 0.16
    0.15658605 = sum of:
      0.15658605 = product of:
        0.97866285 = sum of:
          0.06895762 = weight(abstract_txt:expressions in 962) [ClassicSimilarity], result of:
            0.06895762 = score(doc=962,freq=1.0), product of:
              0.09283063 = queryWeight, product of:
                1.0424412 = boost
                6.7916126 = idf(docFreq=134, maxDocs=44218)
                0.013111935 = queryNorm
              0.74283266 = fieldWeight in 962, product of:
                1.0 = tf(freq=1.0), with freq of:
                  1.0 = termFreq=1.0
                6.7916126 = idf(docFreq=134, maxDocs=44218)
                0.109375 = fieldNorm(doc=962)
          0.07069472 = weight(abstract_txt:measures in 962) [ClassicSimilarity], result of:
            0.07069472 = score(doc=962,freq=1.0), product of:
              0.118915305 = queryWeight, product of:
                1.6685523 = boost
                5.4353957 = idf(docFreq=523, maxDocs=44218)
                0.013111935 = queryNorm
              0.5944964 = fieldWeight in 962, product of:
                1.0 = tf(freq=1.0), with freq of:
                  1.0 = termFreq=1.0
                5.4353957 = idf(docFreq=523, maxDocs=44218)
                0.109375 = fieldNorm(doc=962)
          0.28516614 = weight(abstract_txt:multiword in 962) [ClassicSimilarity], result of:
            0.28516614 = score(doc=962,freq=1.0), product of:
              0.30133188 = queryWeight, product of:
                2.6560943 = boost
                8.652365 = idf(docFreq=20, maxDocs=44218)
                0.013111935 = queryNorm
              0.94635236 = fieldWeight in 962, product of:
                1.0 = tf(freq=1.0), with freq of:
                  1.0 = termFreq=1.0
                8.652365 = idf(docFreq=20, maxDocs=44218)
                0.109375 = fieldNorm(doc=962)
          0.5538444 = weight(abstract_txt:mwes in 962) [ClassicSimilarity], result of:
            0.5538444 = score(doc=962,freq=2.0), product of:
              0.37230164 = queryWeight, product of:
                2.952354 = boost
                9.617446 = idf(docFreq=7, maxDocs=44218)
                0.013111935 = queryNorm
              1.4876227 = fieldWeight in 962, product of:
                1.4142135 = tf(freq=2.0), with freq of:
                  2.0 = termFreq=2.0
                9.617446 = idf(docFreq=7, maxDocs=44218)
                0.109375 = fieldNorm(doc=962)
        0.16 = coord(4/25)