Document (#39921)

Author
Cruys, T. van de
Moirón, B.V.
Title
Semantics-based multiword expression extraction
Source
Proceedings of the Workshop on A Broader Perspective on Multiword Expressions, Prag 2007
Imprint
Prag : Association for Computational Linguistics
Year
2007
Pages
S.25-32
Abstract
This paper describes a fully unsupervised and automated method for large-scale extraction of multiword expressions (MWEs) from large corpora. The method aims at capturing the non-compositionality of MWEs; the intuition is that a noun within an MWE cannot easily be replaced by a semantically similar noun. To implement this intuition, a noun clustering is automatically extracted (using distributional similarity measures), which gives us clusters of semantically related nouns. Next, a number of statistical measures - based on selectional preferences - are developed that formalize the intuition of non-compositionality. Our approach has been tested on Dutch, and automatically evaluated using Dutch lexical resources.
Theme
Computerlinguistik

Similar documents (content)

  1. Nagy T., I.: Detecting multiword expressions and named entities in natural language texts (2014) 0.38
    0.3813432 = sum of:
      0.3813432 = product of:
        0.95335793 = sum of:
          0.03118632 = weight(abstract_txt:lexical in 3537) [ClassicSimilarity], result of:
            0.03118632 = score(doc=3537,freq=2.0), product of:
              0.0862554 = queryWeight, product of:
                6.5448966 = idf(docFreq=166, maxDocs=42740)
                0.013179032 = queryNorm
              0.36155787 = fieldWeight in 3537, product of:
                1.4142135 = tf(freq=2.0), with freq of:
                  2.0 = termFreq=2.0
                6.5448966 = idf(docFreq=166, maxDocs=42740)
                0.0390625 = fieldNorm(doc=3537)
          0.022682117 = weight(abstract_txt:expression in 3537) [ClassicSimilarity], result of:
            0.022682117 = score(doc=3537,freq=1.0), product of:
              0.08789064 = queryWeight, product of:
                1.0094346 = boost
                6.6066446 = idf(docFreq=156, maxDocs=42740)
                0.013179032 = queryNorm
              0.25807205 = fieldWeight in 3537, product of:
                1.0 = tf(freq=1.0), with freq of:
                  1.0 = termFreq=1.0
                6.6066446 = idf(docFreq=156, maxDocs=42740)
                0.0390625 = fieldNorm(doc=3537)
          0.07127725 = weight(abstract_txt:expressions in 3537) [ClassicSimilarity], result of:
            0.07127725 = score(doc=3537,freq=8.0), product of:
              0.094281 = queryWeight, product of:
                1.0454878 = boost
                6.842609 = idf(docFreq=123, maxDocs=42740)
                0.013179032 = queryNorm
              0.7560086 = fieldWeight in 3537, product of:
                2.828427 = tf(freq=8.0), with freq of:
                  8.0 = termFreq=8.0
                6.842609 = idf(docFreq=123, maxDocs=42740)
                0.0390625 = fieldNorm(doc=3537)
          0.0066270265 = weight(abstract_txt:using in 3537) [ClassicSimilarity], result of:
            0.0066270265 = score(doc=3537,freq=1.0), product of:
              0.048757643 = queryWeight, product of:
                1.0632691 = boost
                3.4794931 = idf(docFreq=3580, maxDocs=42740)
                0.013179032 = queryNorm
              0.1359177 = fieldWeight in 3537, product of:
                1.0 = tf(freq=1.0), with freq of:
                  1.0 = termFreq=1.0
                3.4794931 = idf(docFreq=3580, maxDocs=42740)
                0.0390625 = fieldNorm(doc=3537)
          0.03251961 = weight(abstract_txt:method in 3537) [ClassicSimilarity], result of:
            0.03251961 = score(doc=3537,freq=5.0), product of:
              0.08233865 = queryWeight, product of:
                1.3817317 = boost
                4.5216455 = idf(docFreq=1262, maxDocs=42740)
                0.013179032 = queryNorm
              0.3949495 = fieldWeight in 3537, product of:
                2.236068 = tf(freq=5.0), with freq of:
                  5.0 = termFreq=5.0
                4.5216455 = idf(docFreq=1262, maxDocs=42740)
                0.0390625 = fieldNorm(doc=3537)
          0.037263308 = weight(abstract_txt:automatically in 3537) [ClassicSimilarity], result of:
            0.037263308 = score(doc=3537,freq=2.0), product of:
              0.12236986 = queryWeight, product of:
                1.6844537 = boost
                5.5122876 = idf(docFreq=468, maxDocs=42740)
                0.013179032 = queryNorm
              0.30451375 = fieldWeight in 3537, product of:
                1.4142135 = tf(freq=2.0), with freq of:
                  2.0 = termFreq=2.0
                5.5122876 = idf(docFreq=468, maxDocs=42740)
                0.0390625 = fieldNorm(doc=3537)
          0.065448016 = weight(abstract_txt:extraction in 3537) [ClassicSimilarity], result of:
            0.065448016 = score(doc=3537,freq=3.0), product of:
              0.15561596 = queryWeight, product of:
                1.8995421 = boost
                6.216153 = idf(docFreq=231, maxDocs=42740)
                0.013179032 = queryNorm
              0.42057395 = fieldWeight in 3537, product of:
                1.7320508 = tf(freq=3.0), with freq of:
                  3.0 = termFreq=3.0
                6.216153 = idf(docFreq=231, maxDocs=42740)
                0.0390625 = fieldNorm(doc=3537)
          0.33399677 = weight(abstract_txt:multiword in 3537) [ClassicSimilarity], result of:
            0.33399677 = score(doc=3537,freq=11.0), product of:
              0.2991306 = queryWeight, product of:
                2.6336148 = boost
                8.618368 = idf(docFreq=20, maxDocs=42740)
                0.013179032 = queryNorm
              1.1165584 = fieldWeight in 3537, product of:
                3.3166249 = tf(freq=11.0), with freq of:
                  11.0 = termFreq=11.0
                8.618368 = idf(docFreq=20, maxDocs=42740)
                0.0390625 = fieldNorm(doc=3537)
          0.23982637 = weight(abstract_txt:mwes in 3537) [ClassicSimilarity], result of:
            0.23982637 = score(doc=3537,freq=3.0), product of:
              0.36987454 = queryWeight, product of:
                2.928526 = boost
                9.583449 = idf(docFreq=7, maxDocs=42740)
                0.013179032 = queryNorm
              0.64839923 = fieldWeight in 3537, product of:
                1.7320508 = tf(freq=3.0), with freq of:
                  3.0 = termFreq=3.0
                9.583449 = idf(docFreq=7, maxDocs=42740)
                0.0390625 = fieldNorm(doc=3537)
          0.11253111 = weight(abstract_txt:noun in 3537) [ClassicSimilarity], result of:
            0.11253111 = score(doc=3537,freq=1.0), product of:
              0.3687305 = queryWeight, product of:
                3.5811458 = boost
                7.8127427 = idf(docFreq=46, maxDocs=42740)
                0.013179032 = queryNorm
              0.30518526 = fieldWeight in 3537, product of:
                1.0 = tf(freq=1.0), with freq of:
                  1.0 = termFreq=1.0
                7.8127427 = idf(docFreq=46, maxDocs=42740)
                0.0390625 = fieldNorm(doc=3537)
        0.4 = coord(10/25)
    
  2. Snajder, J.; Almic, P.: Modeling semantic compositionality of Croatian multiword expressions (2015) 0.35
    0.34507364 = sum of:
      0.34507364 = product of:
        1.7253681 = sum of:
          0.06048075 = weight(abstract_txt:expressions in 4921) [ClassicSimilarity], result of:
            0.06048075 = score(doc=4921,freq=1.0), product of:
              0.094281 = queryWeight, product of:
                1.0454878 = boost
                6.842609 = idf(docFreq=123, maxDocs=42740)
                0.013179032 = queryNorm
              0.6414946 = fieldWeight in 4921, product of:
                1.0 = tf(freq=1.0), with freq of:
                  1.0 = termFreq=1.0
                6.842609 = idf(docFreq=123, maxDocs=42740)
                0.09375 = fieldNorm(doc=4921)
          0.15481824 = weight(abstract_txt:distributional in 4921) [ClassicSimilarity], result of:
            0.15481824 = score(doc=4921,freq=1.0), product of:
              0.17642528 = queryWeight, product of:
                1.4301686 = boost
                9.360306 = idf(docFreq=9, maxDocs=42740)
                0.013179032 = queryNorm
              0.87752867 = fieldWeight in 4921, product of:
                1.0 = tf(freq=1.0), with freq of:
                  1.0 = termFreq=1.0
                9.360306 = idf(docFreq=9, maxDocs=42740)
                0.09375 = fieldNorm(doc=4921)
          0.24168915 = weight(abstract_txt:multiword in 4921) [ClassicSimilarity], result of:
            0.24168915 = score(doc=4921,freq=1.0), product of:
              0.2991306 = queryWeight, product of:
                2.6336148 = boost
                8.618368 = idf(docFreq=20, maxDocs=42740)
                0.013179032 = queryNorm
              0.807972 = fieldWeight in 4921, product of:
                1.0 = tf(freq=1.0), with freq of:
                  1.0 = termFreq=1.0
                8.618368 = idf(docFreq=20, maxDocs=42740)
                0.09375 = fieldNorm(doc=4921)
          0.5755832 = weight(abstract_txt:mwes in 4921) [ClassicSimilarity], result of:
            0.5755832 = score(doc=4921,freq=3.0), product of:
              0.36987454 = queryWeight, product of:
                2.928526 = boost
                9.583449 = idf(docFreq=7, maxDocs=42740)
                0.013179032 = queryNorm
              1.5561581 = fieldWeight in 4921, product of:
                1.7320508 = tf(freq=3.0), with freq of:
                  3.0 = termFreq=3.0
                9.583449 = idf(docFreq=7, maxDocs=42740)
                0.09375 = fieldNorm(doc=4921)
          0.6927968 = weight(abstract_txt:compositionality in 4921) [ClassicSimilarity], result of:
            0.6927968 = score(doc=4921,freq=4.0), product of:
              0.38025358 = queryWeight, product of:
                2.9693303 = boost
                9.71698 = idf(docFreq=6, maxDocs=42740)
                0.013179032 = queryNorm
              1.8219337 = fieldWeight in 4921, product of:
                2.0 = tf(freq=4.0), with freq of:
                  4.0 = termFreq=4.0
                9.71698 = idf(docFreq=6, maxDocs=42740)
                0.09375 = fieldNorm(doc=4921)
        0.2 = coord(5/25)
    
  3. Nissim, M.; Zaninello, A.: Modeling the internal variability of multiword expressions through a pattern-based method (2013) 0.27
    0.26761875 = sum of:
      0.26761875 = product of:
        0.8363086 = sum of:
          0.0403205 = weight(abstract_txt:expressions in 2991) [ClassicSimilarity], result of:
            0.0403205 = score(doc=2991,freq=1.0), product of:
              0.094281 = queryWeight, product of:
                1.0454878 = boost
                6.842609 = idf(docFreq=123, maxDocs=42740)
                0.013179032 = queryNorm
              0.42766306 = fieldWeight in 2991, product of:
                1.0 = tf(freq=1.0), with freq of:
                  1.0 = termFreq=1.0
                6.842609 = idf(docFreq=123, maxDocs=42740)
                0.0625 = fieldNorm(doc=2991)
          0.010603243 = weight(abstract_txt:using in 2991) [ClassicSimilarity], result of:
            0.010603243 = score(doc=2991,freq=1.0), product of:
              0.048757643 = queryWeight, product of:
                1.0632691 = boost
                3.4794931 = idf(docFreq=3580, maxDocs=42740)
                0.013179032 = queryNorm
              0.21746832 = fieldWeight in 2991, product of:
                1.0 = tf(freq=1.0), with freq of:
                  1.0 = termFreq=1.0
                3.4794931 = idf(docFreq=3580, maxDocs=42740)
                0.0625 = fieldNorm(doc=2991)
          0.02245756 = weight(abstract_txt:large in 2991) [ClassicSimilarity], result of:
            0.02245756 = score(doc=2991,freq=1.0), product of:
              0.08041282 = queryWeight, product of:
                1.3654773 = boost
                4.468454 = idf(docFreq=1331, maxDocs=42740)
                0.013179032 = queryNorm
              0.27927837 = fieldWeight in 2991, product of:
                1.0 = tf(freq=1.0), with freq of:
                  1.0 = termFreq=1.0
                4.468454 = idf(docFreq=1331, maxDocs=42740)
                0.0625 = fieldNorm(doc=2991)
          0.03290753 = weight(abstract_txt:method in 2991) [ClassicSimilarity], result of:
            0.03290753 = score(doc=2991,freq=2.0), product of:
              0.08233865 = queryWeight, product of:
                1.3817317 = boost
                4.5216455 = idf(docFreq=1262, maxDocs=42740)
                0.013179032 = queryNorm
              0.39966077 = fieldWeight in 2991, product of:
                1.4142135 = tf(freq=2.0), with freq of:
                  2.0 = termFreq=2.0
                4.5216455 = idf(docFreq=1262, maxDocs=42740)
                0.0625 = fieldNorm(doc=2991)
          0.040308475 = weight(abstract_txt:measures in 2991) [ClassicSimilarity], result of:
            0.040308475 = score(doc=2991,freq=1.0), product of:
              0.11876299 = queryWeight, product of:
                1.6594433 = boost
                5.4304423 = idf(docFreq=508, maxDocs=42740)
                0.013179032 = queryNorm
              0.33940265 = fieldWeight in 2991, product of:
                1.0 = tf(freq=1.0), with freq of:
                  1.0 = termFreq=1.0
                5.4304423 = idf(docFreq=508, maxDocs=42740)
                0.0625 = fieldNorm(doc=2991)
          0.08550093 = weight(abstract_txt:extraction in 2991) [ClassicSimilarity], result of:
            0.08550093 = score(doc=2991,freq=2.0), product of:
              0.15561596 = queryWeight, product of:
                1.8995421 = boost
                6.216153 = idf(docFreq=231, maxDocs=42740)
                0.013179032 = queryNorm
              0.5494355 = fieldWeight in 2991, product of:
                1.4142135 = tf(freq=2.0), with freq of:
                  2.0 = termFreq=2.0
                6.216153 = idf(docFreq=231, maxDocs=42740)
                0.0625 = fieldNorm(doc=2991)
          0.16112609 = weight(abstract_txt:multiword in 2991) [ClassicSimilarity], result of:
            0.16112609 = score(doc=2991,freq=1.0), product of:
              0.2991306 = queryWeight, product of:
                2.6336148 = boost
                8.618368 = idf(docFreq=20, maxDocs=42740)
                0.013179032 = queryNorm
              0.538648 = fieldWeight in 2991, product of:
                1.0 = tf(freq=1.0), with freq of:
                  1.0 = termFreq=1.0
                8.618368 = idf(docFreq=20, maxDocs=42740)
                0.0625 = fieldNorm(doc=2991)
          0.44308424 = weight(abstract_txt:mwes in 2991) [ClassicSimilarity], result of:
            0.44308424 = score(doc=2991,freq=4.0), product of:
              0.36987454 = queryWeight, product of:
                2.928526 = boost
                9.583449 = idf(docFreq=7, maxDocs=42740)
                0.013179032 = queryNorm
              1.1979312 = fieldWeight in 2991, product of:
                2.0 = tf(freq=4.0), with freq of:
                  4.0 = termFreq=4.0
                9.583449 = idf(docFreq=7, maxDocs=42740)
                0.0625 = fieldNorm(doc=2991)
        0.32 = coord(8/25)
    
  4. Vechtomova, O.: ¬A method for automatic extraction of multiword units representing business aspects from user reviews (2014) 0.17
    0.17301477 = sum of:
      0.17301477 = product of:
        0.72089493 = sum of:
          0.013254053 = weight(abstract_txt:using in 3305) [ClassicSimilarity], result of:
            0.013254053 = score(doc=3305,freq=1.0), product of:
              0.048757643 = queryWeight, product of:
                1.0632691 = boost
                3.4794931 = idf(docFreq=3580, maxDocs=42740)
                0.013179032 = queryNorm
              0.2718354 = fieldWeight in 3305, product of:
                1.0 = tf(freq=1.0), with freq of:
                  1.0 = termFreq=1.0
                3.4794931 = idf(docFreq=3580, maxDocs=42740)
                0.078125 = fieldNorm(doc=3305)
          0.05037916 = weight(abstract_txt:method in 3305) [ClassicSimilarity], result of:
            0.05037916 = score(doc=3305,freq=3.0), product of:
              0.08233865 = queryWeight, product of:
                1.3817317 = boost
                4.5216455 = idf(docFreq=1262, maxDocs=42740)
                0.013179032 = queryNorm
              0.6118531 = fieldWeight in 3305, product of:
                1.7320508 = tf(freq=3.0), with freq of:
                  3.0 = termFreq=3.0
                4.5216455 = idf(docFreq=1262, maxDocs=42740)
                0.078125 = fieldNorm(doc=3305)
          0.18245503 = weight(abstract_txt:distributional in 3305) [ClassicSimilarity], result of:
            0.18245503 = score(doc=3305,freq=2.0), product of:
              0.17642528 = queryWeight, product of:
                1.4301686 = boost
                9.360306 = idf(docFreq=9, maxDocs=42740)
                0.013179032 = queryNorm
              1.0341774 = fieldWeight in 3305, product of:
                1.4142135 = tf(freq=2.0), with freq of:
                  2.0 = termFreq=2.0
                9.360306 = idf(docFreq=9, maxDocs=42740)
                0.078125 = fieldNorm(doc=3305)
          0.050385594 = weight(abstract_txt:measures in 3305) [ClassicSimilarity], result of:
            0.050385594 = score(doc=3305,freq=1.0), product of:
              0.11876299 = queryWeight, product of:
                1.6594433 = boost
                5.4304423 = idf(docFreq=508, maxDocs=42740)
                0.013179032 = queryNorm
              0.4242533 = fieldWeight in 3305, product of:
                1.0 = tf(freq=1.0), with freq of:
                  1.0 = termFreq=1.0
                5.4304423 = idf(docFreq=508, maxDocs=42740)
                0.078125 = fieldNorm(doc=3305)
          0.075572856 = weight(abstract_txt:extraction in 3305) [ClassicSimilarity], result of:
            0.075572856 = score(doc=3305,freq=1.0), product of:
              0.15561596 = queryWeight, product of:
                1.8995421 = boost
                6.216153 = idf(docFreq=231, maxDocs=42740)
                0.013179032 = queryNorm
              0.48563695 = fieldWeight in 3305, product of:
                1.0 = tf(freq=1.0), with freq of:
                  1.0 = termFreq=1.0
                6.216153 = idf(docFreq=231, maxDocs=42740)
                0.078125 = fieldNorm(doc=3305)
          0.34884822 = weight(abstract_txt:multiword in 3305) [ClassicSimilarity], result of:
            0.34884822 = score(doc=3305,freq=3.0), product of:
              0.2991306 = queryWeight, product of:
                2.6336148 = boost
                8.618368 = idf(docFreq=20, maxDocs=42740)
                0.013179032 = queryNorm
              1.1662071 = fieldWeight in 3305, product of:
                1.7320508 = tf(freq=3.0), with freq of:
                  3.0 = termFreq=3.0
                8.618368 = idf(docFreq=20, maxDocs=42740)
                0.078125 = fieldNorm(doc=3305)
        0.24 = coord(6/25)
    
  5. Rayson, P.; Piao, S.; Sharoff, S.; Evert, S.; Moirón, B.V.: Multiword expressions : hard going or plain sailing? (2015) 0.16
    0.15626234 = sum of:
      0.15626234 = product of:
        0.9766396 = sum of:
          0.07127725 = weight(abstract_txt:expressions in 4919) [ClassicSimilarity], result of:
            0.07127725 = score(doc=4919,freq=2.0), product of:
              0.094281 = queryWeight, product of:
                1.0454878 = boost
                6.842609 = idf(docFreq=123, maxDocs=42740)
                0.013179032 = queryNorm
              0.7560086 = fieldWeight in 4919, product of:
                1.4142135 = tf(freq=2.0), with freq of:
                  2.0 = termFreq=2.0
                6.842609 = idf(docFreq=123, maxDocs=42740)
                0.078125 = fieldNorm(doc=4919)
          0.3916348 = weight(abstract_txt:mwes in 4919) [ClassicSimilarity], result of:
            0.3916348 = score(doc=4919,freq=2.0), product of:
              0.36987454 = queryWeight, product of:
                2.928526 = boost
                9.583449 = idf(docFreq=7, maxDocs=42740)
                0.013179032 = queryNorm
              1.0588315 = fieldWeight in 4919, product of:
                1.4142135 = tf(freq=2.0), with freq of:
                  2.0 = termFreq=2.0
                9.583449 = idf(docFreq=7, maxDocs=42740)
                0.078125 = fieldNorm(doc=4919)
          0.28866535 = weight(abstract_txt:compositionality in 4919) [ClassicSimilarity], result of:
            0.28866535 = score(doc=4919,freq=1.0), product of:
              0.38025358 = queryWeight, product of:
                2.9693303 = boost
                9.71698 = idf(docFreq=6, maxDocs=42740)
                0.013179032 = queryNorm
              0.75913906 = fieldWeight in 4919, product of:
                1.0 = tf(freq=1.0), with freq of:
                  1.0 = termFreq=1.0
                9.71698 = idf(docFreq=6, maxDocs=42740)
                0.078125 = fieldNorm(doc=4919)
          0.22506222 = weight(abstract_txt:noun in 4919) [ClassicSimilarity], result of:
            0.22506222 = score(doc=4919,freq=1.0), product of:
              0.3687305 = queryWeight, product of:
                3.5811458 = boost
                7.8127427 = idf(docFreq=46, maxDocs=42740)
                0.013179032 = queryNorm
              0.6103705 = fieldWeight in 4919, product of:
                1.0 = tf(freq=1.0), with freq of:
                  1.0 = termFreq=1.0
                7.8127427 = idf(docFreq=46, maxDocs=42740)
                0.078125 = fieldNorm(doc=4919)
        0.16 = coord(4/25)